From 1ef227f4824432f25db29bdfff22e8da94f960a1 Mon Sep 17 00:00:00 2001
From: byuu <2107894+byuu@users.noreply.github.com>
Date: Wed, 25 Sep 2019 15:13:12 +0900
Subject: [PATCH] v110.2

Added CRT-Royale [hunterk]
Improved libretro target [rtretiakov]
---
 bsnes/emulator/emulator.hpp                   |     2 +-
 bsnes/target-bsnes/GNUmakefile                |     7 +-
 .../presentation/presentation.cpp             |     2 +-
 bsnes/target-libretro/GNUmakefile             |    14 +-
 bsnes/target-libretro/program.cpp             |     5 +-
 bsnes/target-libretro/resources.hpp           |   760 +-
 shaders/CRT-Royale.shader/bloom-approx.fs     | 13973 +++++++++++++++
 shaders/CRT-Royale.shader/bloom-approx.vs     |  5859 +++++++
 .../bloom-horizontal-reconstitute.fs          |  7240 ++++++++
 .../bloom-horizontal-reconstitute.vs          |  6570 +++++++
 shaders/CRT-Royale.shader/bloom-vertical.fs   |  4824 +++++
 shaders/CRT-Royale.shader/bloom-vertical.vs   |  3792 ++++
 .../CRT-Royale.shader/blur9fast-horizontal.fs |  2016 +++
 .../CRT-Royale.shader/blur9fast-horizontal.vs |  2025 +++
 .../CRT-Royale.shader/blur9fast-vertical.fs   |  2016 +++
 .../CRT-Royale.shader/blur9fast-vertical.vs   |  2025 +++
 shaders/CRT-Royale.shader/brightpass.fs       | 14481 ++++++++++++++++
 shaders/CRT-Royale.shader/brightpass.vs       |  6551 +++++++
 ...rst-pass-linearize-crt-gamma-bob-fields.fs |  4748 +++++
 ...rst-pass-linearize-crt-gamma-bob-fields.vs |  4704 +++++
 .../geometry-aa-last-pass.fs                  |  5279 ++++++
 .../geometry-aa-last-pass.vs                  |  5263 ++++++
 shaders/CRT-Royale.shader/manifest.bml        |   214 +
 .../mask-resize-horizontal.fs                 |  3208 ++++
 .../mask-resize-horizontal.vs                 |  3236 ++++
 .../CRT-Royale.shader/mask-resize-vertical.fs |  3248 ++++
 .../CRT-Royale.shader/mask-resize-vertical.vs |  3212 ++++
 .../scanlines-horizontal-apply-mask.fs        | 10845 ++++++++++++
 .../scanlines-horizontal-apply-mask.vs        |  6047 +++++++
 .../scanlines-vertical-interlacing.fs         |  5963 +++++++
 .../scanlines-vertical-interlacing.vs         |  5830 +++++++
 ...nearApertureGrille15Wide8And5d5Spacing.png |   Bin 0 -> 198848 bytes
 ...reGrille15Wide8And5d5SpacingResizeTo64.png |   Bin 0 -> 4173 bytes
 .../textures/TileableLinearShadowMask.png     |   Bin 0 -> 218631 bytes
 .../textures/TileableLinearShadowMaskEDP.png  |   Bin 0 -> 206668 bytes
 .../TileableLinearShadowMaskEDPResizeTo64.png |   Bin 0 -> 5373 bytes
 .../TileableLinearShadowMaskResizeTo64.png    |   Bin 0 -> 6008 bytes
 ...de9And4d5Horizontal9d14VerticalSpacing.png |   Bin 0 -> 204254 bytes
 ...orizontal9d14VerticalSpacingResizeTo64.png |   Bin 0 -> 6916 bytes
 39 files changed, 133576 insertions(+), 383 deletions(-)
 create mode 100644 shaders/CRT-Royale.shader/bloom-approx.fs
 create mode 100644 shaders/CRT-Royale.shader/bloom-approx.vs
 create mode 100644 shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.fs
 create mode 100644 shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.vs
 create mode 100644 shaders/CRT-Royale.shader/bloom-vertical.fs
 create mode 100644 shaders/CRT-Royale.shader/bloom-vertical.vs
 create mode 100644 shaders/CRT-Royale.shader/blur9fast-horizontal.fs
 create mode 100644 shaders/CRT-Royale.shader/blur9fast-horizontal.vs
 create mode 100644 shaders/CRT-Royale.shader/blur9fast-vertical.fs
 create mode 100644 shaders/CRT-Royale.shader/blur9fast-vertical.vs
 create mode 100644 shaders/CRT-Royale.shader/brightpass.fs
 create mode 100644 shaders/CRT-Royale.shader/brightpass.vs
 create mode 100644 shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.fs
 create mode 100644 shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.vs
 create mode 100644 shaders/CRT-Royale.shader/geometry-aa-last-pass.fs
 create mode 100644 shaders/CRT-Royale.shader/geometry-aa-last-pass.vs
 create mode 100644 shaders/CRT-Royale.shader/manifest.bml
 create mode 100644 shaders/CRT-Royale.shader/mask-resize-horizontal.fs
 create mode 100644 shaders/CRT-Royale.shader/mask-resize-horizontal.vs
 create mode 100644 shaders/CRT-Royale.shader/mask-resize-vertical.fs
 create mode 100644 shaders/CRT-Royale.shader/mask-resize-vertical.vs
 create mode 100644 shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.fs
 create mode 100644 shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.vs
 create mode 100644 shaders/CRT-Royale.shader/scanlines-vertical-interlacing.fs
 create mode 100644 shaders/CRT-Royale.shader/scanlines-vertical-interlacing.vs
 create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png
 create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png
 create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearShadowMask.png
 create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDP.png
 create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDPResizeTo64.png
 create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskResizeTo64.png
 create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png
 create mode 100644 shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png

diff --git a/bsnes/emulator/emulator.hpp b/bsnes/emulator/emulator.hpp
index b720b098..91e3be52 100644
--- a/bsnes/emulator/emulator.hpp
+++ b/bsnes/emulator/emulator.hpp
@@ -29,7 +29,7 @@ using namespace nall;
 
 namespace Emulator {
   static const string Name    = "bsnes";
-  static const string Version = "110.1";
+  static const string Version = "110.2";
   static const string Author  = "byuu";
   static const string License = "GPLv3";
   static const string Website = "https://byuu.org";
diff --git a/bsnes/target-bsnes/GNUmakefile b/bsnes/target-bsnes/GNUmakefile
index 79a3a705..7c7baa72 100644
--- a/bsnes/target-bsnes/GNUmakefile
+++ b/bsnes/target-bsnes/GNUmakefile
@@ -27,9 +27,11 @@ ifeq ($(platform),macos)
 	mkdir -p out/$(name).app/Contents/MacOS/
 	mkdir -p out/$(name).app/Contents/MacOS/Database/
 	mkdir -p out/$(name).app/Contents/MacOS/Firmware/
+	mkdir -p out/$(name).app/Contents/MacOS/Shaders/
 	mkdir -p out/$(name).app/Contents/Resources/
 	mv out/$(name) out/$(name).app/Contents/MacOS/$(name)
 	cp Database/* out/$(name).app/Contents/MacOS/Database/
+	cp -r ../shaders/* out/$(name).app/Contents/MacOS/Shaders/
 	cp $(ui)/resource/$(name).plist out/$(name).app/Contents/Info.plist
 	sips -s format icns $(ui)/resource/$(name).png --out out/$(name).app/Contents/Resources/$(name).icns
 endif
@@ -44,6 +46,7 @@ else ifeq ($(platform),macos)
 	mkdir -p ~/Library/Application\ Support/$(name)/
 	mkdir -p ~/Library/Application\ Support/$(name)/Database/
 	mkdir -p ~/Library/Application\ Support/$(name)/Firmware/
+	mkdir -p ~/Library/Application\ Support/$(name)/Shaders/
 	cp -R out/$(name).app /Applications/$(name).app
 else ifneq ($(filter $(platform),linux bsd),)
 	mkdir -p $(prefix)/bin/
@@ -52,12 +55,12 @@ else ifneq ($(filter $(platform),linux bsd),)
 	mkdir -p $(prefix)/share/$(name)/
 	mkdir -p $(prefix)/share/$(name)/Database/
 	mkdir -p $(prefix)/share/$(name)/Firmware/
-	mkdir -p $(prefix)/share/$(name)/Locale/
+	mkdir -p $(prefix)/share/$(name)/Shaders/
 	cp out/$(name) $(prefix)/bin/$(name)
 	cp $(ui)/resource/$(name).desktop $(prefix)/share/applications/$(name).desktop
 	cp $(ui)/resource/$(name).png $(prefix)/share/icons/$(name).png
 	cp Database/* $(prefix)/share/$(name)/Database/
-	cp Locale/* $(prefix)/share/$(name)/Locale/
+	cp -r ../shaders/* $(prefix)/share/$(name)/Shaders/
 endif
 
 uninstall:
diff --git a/bsnes/target-bsnes/presentation/presentation.cpp b/bsnes/target-bsnes/presentation/presentation.cpp
index 94b8e204..700454e4 100644
--- a/bsnes/target-bsnes/presentation/presentation.cpp
+++ b/bsnes/target-bsnes/presentation/presentation.cpp
@@ -530,7 +530,7 @@ auto Presentation::updateShaders() -> void {
   });
   shaders.append(blur);
 
-  auto location = locate("shaders/");
+  auto location = locate("Shaders/");
 
   if(settings.video.driver == "OpenGL 3.2") {
     for(auto shader : directory::folders(location, "*.shader")) {
diff --git a/bsnes/target-libretro/GNUmakefile b/bsnes/target-libretro/GNUmakefile
index 4c4cf7fb..104128e3 100644
--- a/bsnes/target-libretro/GNUmakefile
+++ b/bsnes/target-libretro/GNUmakefile
@@ -1,5 +1,7 @@
 name := libretro.so
-flags += -Wno-narrowing -Wno-multichar -fopenmp -g -fPIC
+local := false
+openmp := true
+flags += -Wno-narrowing -Wno-multichar -g -fPIC
 
 objects := libretro $(objects)
 objects := $(patsubst %,obj/%.o,$(objects))
@@ -13,4 +15,14 @@ else ifeq ($(platform),windows)
 	$(strip $(compiler) -o out/bsnes_libretro.dll -shared $(objects) -Wl,--no-undefined -Wl,--version-script=target-libretro/link.T -static-libgcc -static-libstdc++ -Wl,-Bstatic -lws2_32 -lpthread -lgomp -Wl,-Bdynamic)
 else ifeq ($(platform),macos)
 	$(strip $(compiler) -o out/bsnes_libretro.dylib -shared $(objects) -lpthread -ldl)
+else ifeq ($(platform), ios-arm64)
+    ifeq ($(IOSSDK),)
+       IOSSDK := $(shell xcodebuild -version -sdk iphoneos Path)
+    endif
+	$(strip c++ -arch arm64 -marm -miphoneos-version-min=11.0 -isysroot $(IOSSDK) -o out/bsnes_libretro_ios.dylib -shared $(objects) -lpthread -ldl)
+else ifeq ($(platform), tvos-arm64)
+    ifeq ($(IOSSDK),)
+       IOSSDK := $(shell xcodebuild -version -sdk appletvos Path)
+    endif
+	$(strip c++ -arch arm64 -marm -mtvos-version-min=11.0 -isysroot $(IOSSDK) -o out/bsnes_libretro_tvos.dylib -shared $(objects) -lpthread -ldl)
 endif
diff --git a/bsnes/target-libretro/program.cpp b/bsnes/target-libretro/program.cpp
index c07365c7..cbfa4c37 100644
--- a/bsnes/target-libretro/program.cpp
+++ b/bsnes/target-libretro/program.cpp
@@ -146,6 +146,9 @@ auto Program::load() -> void {
 	//fixes an errant scanline on the title screen due to writing to PPU registers too late
 	if(title == "ADVENTURES OF FRANKEN" && region == "PAL") emulator->configure("Hacks/PPU/RenderCycle", 32);
 
+	//fixes an errant scanline on the title screen due to writing to PPU registers too late
+	if(title == "FIREPOWER 2000") emulator->configure("Hacks/PPU/RenderCycle", 32);
+
 	emulator->power();
 }
 
@@ -288,7 +291,7 @@ auto Program::openRomSuperFamicom(string name, vfs::file::mode mode) -> shared_p
 		string save_path;
 
 		auto suffix = Location::suffix(base_name);
-		auto base = Location::base(base_name);
+		auto base = Location::base(base_name.transform("\\", "/"));
 
 		const char *save = nullptr;
 		if (environ_cb && environ_cb(RETRO_ENVIRONMENT_GET_SAVE_DIRECTORY, &save) && save)
diff --git a/bsnes/target-libretro/resources.hpp b/bsnes/target-libretro/resources.hpp
index b0ba3fa6..1a438626 100644
--- a/bsnes/target-libretro/resources.hpp
+++ b/bsnes/target-libretro/resources.hpp
@@ -1,4 +1,4 @@
-const unsigned char boardsbml[30846] = {
+const unsigned char boardsbml[31025] = {
   100,97,116,97,98,97,115,101,10,32,32,114,101,118,105,115,105,111,110,58,32,50,48,49,56,45,48,55,45,50,53,10,
   10,47,47,66,111,97,114,100,115,32,40,80,114,111,100,117,99,116,105,111,110,41,10,10,100,97,116,97,98,97,115,101,
   10,32,32,114,101,118,105,115,105,111,110,58,32,50,48,49,56,45,48,53,45,49,54,10,10,98,111,97,114,100,58,32,
@@ -571,398 +571,404 @@ const unsigned char boardsbml[30846] = {
   109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,
   32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,
   102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,
-  48,48,48,48,45,102,102,102,102,10,10,47,47,66,111,97,114,100,115,32,40,71,101,110,101,114,105,99,41,10,10,100,
-  97,116,97,98,97,115,101,10,32,32,114,101,118,105,115,105,111,110,58,32,50,48,49,56,45,48,55,45,50,53,10,10,
-  98,111,97,114,100,58,32,65,82,77,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,
-  112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,
-  100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,
-  107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,54,102,44,99,
-  48,45,101,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,
-  111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,
-  112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,
-  32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,
-  32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,56,48,48,45,51,
-  56,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,
-  80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,109,101,
-  109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,
-  116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,
+  48,48,48,48,45,102,102,102,102,10,10,47,47,66,111,97,114,100,115,32,40,80,114,111,116,111,116,121,112,101,115,41,
+  10,10,98,111,97,114,100,58,32,83,72,86,67,45,52,80,86,53,66,45,48,49,10,32,32,109,101,109,111,114,121,32,
+  116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,
+  32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,
+  97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,
+  44,99,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,47,47,
+  66,111,97,114,100,115,32,40,71,101,110,101,114,105,99,41,10,10,100,97,116,97,98,97,115,101,10,32,32,114,101,118,
+  105,115,105,111,110,58,32,50,48,49,56,45,48,55,45,50,53,10,10,98,111,97,114,100,58,32,65,82,77,45,76,79,
+  82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,
+  116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,
+  56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,
+  109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,54,102,44,99,48,45,101,102,58,48,48,48,48,45,55,102,102,
+  102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,
+  99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,
+  55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97,
+  114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
+  61,48,48,45,51,102,44,56,48,45,98,102,58,51,56,48,48,45,51,56,102,102,10,32,32,32,32,109,101,109,111,114,
+  121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,
+  116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,
   32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10,
-  32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,66,83,45,72,73,82,79,77,45,
-  82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,
-  111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,
-  102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,53,
-  102,44,99,48,45,100,102,58,48,48,48,48,45,102,102,102,102,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,
-  82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
-  61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,
-  48,48,10,32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114,121,10,32,32,32,32,109,97,112,32,
-  97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,
-  32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,48,45,55,100,44,101,48,45,102,102,58,48,48,48,48,45,102,
-  102,102,102,10,10,98,111,97,114,100,58,32,66,83,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,
-  121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,
-  97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,
-  48,120,48,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,
-  114,101,115,115,61,50,48,45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,49,48,48,48,
-  48,48,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,
-  48,45,57,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,50,48,48,48,48,48,32,109,97,115,
-  107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,97,48,45,98,102,58,56,
-  48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,49,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48,
-  48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,
-  101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,
-  48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,115,108,111,116,32,116,121,112,101,61,
-  66,83,77,101,109,111,114,121,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,101,102,58,48,
-  48,48,48,45,102,102,102,102,10,10,98,111,97,114,100,58,32,66,83,45,77,67,67,45,82,65,77,10,32,32,109,101,
+  32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,
+  32,97,114,99,104,105,116,101,99,116,117,114,101,61,65,82,77,54,10,32,32,32,32,111,115,99,105,108,108,97,116,111,
+  114,10,10,98,111,97,114,100,58,32,66,83,45,72,73,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,
+  116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,
+  32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,10,32,
+  32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,53,102,44,99,48,45,100,102,58,48,48,48,48,45,
+  102,102,102,102,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,
+  97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,
+  54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,115,108,111,116,32,116,121,112,
+  101,61,66,83,77,101,109,111,114,121,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,
+  44,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
+  61,54,48,45,55,100,44,101,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,10,98,111,97,114,100,58,32,66,
+  83,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,
+  110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,
+  45,49,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,48,48,48,48,48,48,32,109,97,115,107,
+  61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,58,56,48,
+  48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,49,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48,48,
+  48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,57,102,58,56,48,48,48,45,102,102,102,
+  102,32,98,97,115,101,61,48,120,50,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,
+  109,97,112,32,97,100,100,114,101,115,115,61,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,
+  61,48,120,49,48,48,48,48,48,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,
+  121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,
+  114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,
+  48,120,56,48,48,48,10,32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114,121,10,32,32,32,32,
+  109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,101,102,58,48,48,48,48,45,102,102,102,102,10,10,98,111,97,
+  114,100,58,32,66,83,45,77,67,67,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,
+  32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,49,48,
+  45,49,55,58,53,48,48,48,45,53,102,102,102,32,109,97,115,107,61,48,120,102,48,48,48,10,32,32,112,114,111,99,
+  101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,77,67,67,10,32,32,32,32,109,97,112,32,97,100,100,
+  114,101,115,115,61,48,48,45,48,102,58,53,48,48,48,45,53,102,102,102,10,32,32,32,32,109,99,117,10,32,32,32,
+  32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,
+  102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,
+  102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,
+  48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,
+  121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,
+  32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,111,119,110,108,111,97,
+  100,10,32,32,32,32,32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114,121,10,10,98,111,97,114,
+  100,58,32,66,83,45,83,65,49,45,82,65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,
+  101,99,116,117,114,101,61,87,54,53,67,56,49,54,83,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,
+  48,48,45,51,102,44,56,48,45,98,102,58,50,50,48,48,45,50,51,102,102,10,32,32,32,32,109,99,117,10,32,32,
+  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,
+  45,102,102,102,102,32,109,97,115,107,61,48,120,52,48,56,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,
+  100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,
+  114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,
+  32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114,121,10,32,32,32,32,109,101,109,111,114,121,32,
+  116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,
+  97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,115,105,
+  122,101,61,48,120,50,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,52,
+  102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,
+  99,111,110,116,101,110,116,61,73,110,116,101,114,110,97,108,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,
+  115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,48,48,48,45,51,55,102,102,32,115,105,122,101,61,48,120,
+  56,48,48,10,10,98,111,97,114,100,58,32,69,86,69,78,84,45,67,67,57,50,10,32,32,109,101,109,111,114,121,32,
+  116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,
+  100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,
+  61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97,110,117,102,97,99,116,117,114,101,114,
+  61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,56,50,49,52,10,32,32,32,32,105,
+  100,101,110,116,105,102,105,101,114,58,32,67,97,109,112,117,115,32,67,104,97,108,108,101,110,103,101,32,39,57,50,10,
+  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,44,101,48,58,48,48,48,48,10,32,32,32,32,109,
+  99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,
+  58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,
+  32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,
+  112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,49,10,32,32,32,32,32,32,109,101,109,
+  111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,50,10,32,32,32,
+  32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,
+  45,51,10,32,32,32,32,100,105,112,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97,110,117,102,97,99,116,117,
+  114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,
+  32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,56,48,48,48,45,102,102,
+  102,102,32,109,97,115,107,61,48,120,55,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,
+  79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,
+  117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,
+  101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,
+  32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,
+  114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,
+  111,114,10,10,98,111,97,114,100,58,32,69,86,69,78,84,45,80,70,57,52,10,32,32,109,101,109,111,114,121,32,116,
+  121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,
+  114,101,115,115,61,51,48,45,51,102,44,98,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,
+  48,120,101,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97,110,117,102,97,99,116,117,114,101,114,61,
+  78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,56,50,49,52,10,32,32,32,32,105,100,
+  101,110,116,105,102,105,101,114,58,32,80,111,119,101,114,70,101,115,116,32,39,57,52,10,32,32,32,32,109,97,112,32,
+  97,100,100,114,101,115,115,61,49,48,44,50,48,58,54,48,48,48,10,32,32,32,32,109,99,117,10,32,32,32,32,32,
+  32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,
+  102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,
+  102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,
+  110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,
+  32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,49,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,
+  112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,50,10,32,32,32,32,32,32,109,101,109,
+  111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,51,10,32,32,32,
+  32,100,105,112,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97,110,117,102,97,99,116,117,114,101,114,61,78,69,
+  67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,
+  100,100,114,101,115,115,61,48,48,45,48,102,44,56,48,45,56,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,
+  107,61,48,120,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,
+  101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,
+  10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,
+  97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,
+  121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,
+  116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,
+  114,100,58,32,69,88,72,73,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,
+  110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,
+  45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,
+  109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,58,48,48,48,48,45,102,102,102,102,32,98,97,115,101,
+  61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,98,102,58,
+  56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,109,97,112,32,
+  97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,
+  48,48,48,48,48,10,10,98,111,97,114,100,58,32,69,88,72,73,82,79,77,45,82,65,77,10,32,32,109,101,109,111,
+  114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,
+  109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,
+  61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,58,
+  48,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,
+  97,100,100,114,101,115,115,61,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,
+  48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,
+  45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,
+  101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,
+  115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,
+  101,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,58,48,48,48,48,45,
+  55,102,102,102,10,10,98,111,97,114,100,58,32,69,88,72,73,82,79,77,45,82,65,77,45,83,72,65,82,80,82,84,
+  67,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,
+  114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,58,56,48,48,48,45,102,
+  102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,
+  115,61,52,48,45,55,100,58,48,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,
+  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,
+  109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,
+  45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,109,101,
   109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,
-  97,112,32,97,100,100,114,101,115,115,61,49,48,45,49,55,58,53,48,48,48,45,53,102,102,102,32,109,97,115,107,61,
-  48,120,102,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,77,67,
-  67,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,48,102,58,53,48,48,48,45,53,102,102,
-  102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,
-  102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,
-  101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,
-  109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,
-  102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,
-  80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,
-  110,116,101,110,116,61,68,111,119,110,108,111,97,100,10,32,32,32,32,32,32,115,108,111,116,32,116,121,112,101,61,66,
-  83,77,101,109,111,114,121,10,10,98,111,97,114,100,58,32,66,83,45,83,65,49,45,82,65,77,10,32,32,112,114,111,
-  99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,87,54,53,67,56,49,54,83,10,32,32,32,
-  32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,50,50,48,48,45,50,51,
-  102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,
-  51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,52,48,56,48,48,48,
-  10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,
-  102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,
-  61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,115,108,111,116,32,116,121,112,101,61,66,83,77,101,109,111,114,
-  121,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,
-  118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,
-  58,54,48,48,48,45,55,102,102,102,32,115,105,122,101,61,48,120,50,48,48,48,10,32,32,32,32,32,32,109,97,112,
-  32,97,100,100,114,101,115,115,61,52,48,45,52,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,101,109,
-  111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,73,110,116,101,114,110,97,108,10,32,32,
-  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,48,48,48,
-  45,51,55,102,102,32,115,105,122,101,61,48,120,56,48,48,10,10,98,111,97,114,100,58,32,69,86,69,78,84,45,67,
-  67,57,50,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,
-  118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,
-  48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,
-  32,109,97,110,117,102,97,99,116,117,114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,
-  80,68,55,56,50,49,52,10,32,32,32,32,105,100,101,110,116,105,102,105,101,114,58,32,67,97,109,112,117,115,32,67,
-  104,97,108,108,101,110,103,101,32,39,57,50,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,44,
-  101,48,58,48,48,48,48,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,
-  115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,
+  97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,
+  32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,
+  55,100,58,48,48,48,48,45,55,102,102,102,10,32,32,114,116,99,32,109,97,110,117,102,97,99,116,117,114,101,114,61,
+  83,104,97,114,112,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,
+  102,58,50,56,48,48,45,50,56,48,49,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,84,67,32,
+  99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110,117,102,97,99,116,117,114,101,114,61,83,104,97,114,112,10,
+  10,98,111,97,114,100,58,32,69,88,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,
+  77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,
+  115,61,48,48,45,55,100,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,56,48,48,48,32,
+  98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,
+  45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101,
+  61,48,120,48,48,48,48,48,48,10,10,98,111,97,114,100,58,32,69,88,76,79,82,79,77,45,82,65,77,10,32,32,
+  109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,
+  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,58,56,48,48,48,45,102,102,102,102,32,
+  109,97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,
+  109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,
+  61,48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,48,48,48,48,48,48,10,32,32,109,101,109,111,114,121,
+  32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,
+  100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,
+  107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,69,88,78,69,67,45,76,79,82,79,77,10,32,32,109,
+  101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,
+  32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,
+  102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,
+  104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,
+  115,115,61,54,48,45,54,55,44,101,48,45,101,55,58,48,48,48,48,45,51,102,102,102,10,32,32,32,32,109,101,109,
+  111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,
+  104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10,32,32,32,32,109,101,109,111,114,121,32,116,121,
+  112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,
+  61,117,80,68,57,54,48,53,48,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,
+  110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,
+  10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,56,45,54,102,44,101,56,45,101,102,58,48,
+  48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,111,115,99,105,108,108,97,
+  116,111,114,10,10,98,111,97,114,100,58,32,69,88,83,80,67,55,49,49,48,45,82,65,77,45,69,80,83,79,78,82,
+  84,67,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,69,120,112,
+  97,110,115,105,111,110,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,52,102,58,48,48,48,
+  48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,80,
+  67,55,49,49,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,
+  102,58,52,56,48,48,45,52,56,51,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,53,48,44,53,
+  56,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,
+  100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,
+  61,48,120,56,48,48,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,
+  102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,32,32,
+  109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,
+  32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,
+  116,97,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,
+  97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,
+  102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,114,116,99,32,109,97,
+  110,117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
+  61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,52,48,45,52,56,52,50,10,32,32,32,32,109,101,109,111,114,
+  121,32,116,121,112,101,61,82,84,67,32,99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110,117,102,97,99,116,
+  117,114,101,114,61,69,112,115,111,110,10,10,98,111,97,114,100,58,32,71,66,45,76,79,82,79,77,10,32,32,109,101,
   109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,
-  32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,
-  108,45,49,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,
-  116,61,76,101,118,101,108,45,50,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,
-  99,111,110,116,101,110,116,61,76,101,118,101,108,45,51,10,32,32,32,32,100,105,112,10,32,32,112,114,111,99,101,115,
-  115,111,114,32,109,97,110,117,102,97,99,116,117,114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,
-  101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,
-  97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,55,102,102,102,10,32,32,32,32,
-  109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,
-  97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,
-  116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,
-  114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,
-  111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,
-  10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,69,86,69,78,84,45,80,70,
-  57,52,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,
-  101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,51,48,45,51,102,44,98,48,45,98,102,58,54,48,
-  48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,
-  109,97,110,117,102,97,99,116,117,114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,
-  68,55,56,50,49,52,10,32,32,32,32,105,100,101,110,116,105,102,105,101,114,58,32,80,111,119,101,114,70,101,115,116,
-  32,39,57,52,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,49,48,44,50,48,58,54,48,48,48,10,
-  32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,
-  56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,
-  115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,
-  121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,
-  109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,108,45,49,10,32,32,
-  32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,76,101,118,101,
-  108,45,50,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,
-  116,61,76,101,118,101,108,45,51,10,32,32,32,32,100,105,112,10,32,32,112,114,111,99,101,115,115,111,114,32,109,97,
-  110,117,102,97,99,116,117,114,101,114,61,78,69,67,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,
-  55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,48,102,44,56,48,45,56,102,58,
+  32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102,
+  102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,
+  52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,
+  48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,73,67,68,32,114,101,118,
+  105,115,105,111,110,61,50,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,
+  45,98,102,58,54,48,48,48,45,54,55,102,102,44,55,48,48,48,45,55,102,102,102,10,32,32,32,32,109,101,109,111,
+  114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,66,111,111,116,32,97,114,99,104,105,116,101,
+  99,116,117,114,101,61,76,82,51,53,57,48,50,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,32,32,32,
+  32,115,108,111,116,32,116,121,112,101,61,71,97,109,101,66,111,121,10,10,98,111,97,114,100,58,32,71,83,85,45,82,
+  65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,71,83,85,10,
+  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,48,48,48,
+  45,51,52,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,
+  116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,
+  102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,
+  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,53,102,44,99,48,45,100,102,58,48,48,48,48,
+  45,102,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,
+  116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,
+  48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,115,105,122,101,61,48,120,50,48,48,48,10,32,32,32,32,32,
+  32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,49,44,102,48,45,102,49,58,48,48,48,48,45,102,102,
+  102,102,10,10,98,111,97,114,100,58,32,72,73,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,
+  79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,
+  115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,
+  32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,10,
+  98,111,97,114,100,58,32,72,73,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,
+  79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,
+  115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,
+  32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,
+  32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,
+  32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,
+  102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,10,98,111,97,114,100,58,32,72,73,84,65,67,72,73,45,
+  76,79,82,79,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,72,
+  71,53,49,66,83,49,54,57,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,
+  48,45,98,102,58,54,99,48,48,45,54,102,102,102,44,55,99,48,48,45,55,102,102,102,10,32,32,32,32,109,101,109,
+  111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,
+  32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,
+  102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,
+  61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,
+  101,115,115,61,55,48,45,55,55,58,48,48,48,48,45,55,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,
+  121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,
+  101,61,72,71,53,49,66,83,49,54,57,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,
+  99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,72,71,53,49,66,83,
+  49,54,57,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,
+  102,58,54,48,48,48,45,54,98,102,102,44,55,48,48,48,45,55,98,102,102,32,109,97,115,107,61,48,120,102,48,48,
+  48,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,76,79,82,79,77,10,32,
+  32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,
+  10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,
+  48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,76,79,82,79,77,
+  45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,
+  114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,44,56,48,45,
+  102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,
+  121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,
+  97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,
+  115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,76,79,82,79,77,45,82,65,77,35,65,10,32,32,
+  109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,
+  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,
+  45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,
+  82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
+  61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,
+  48,48,10,10,98,111,97,114,100,58,32,78,69,67,45,72,73,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,
+  112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,
+  100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,
+  32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102,
+  102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,
+  55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,
   54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,
   116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,
   99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,
   77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,
   55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,
   68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,
-  99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,69,88,72,73,82,79,77,10,32,32,109,101,109,111,114,
-  121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,
-  97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,
-  48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,58,48,
-  48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,
-  100,100,114,101,115,115,61,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,
-  48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,
-  102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,10,98,111,97,114,100,58,32,69,88,72,73,82,
-  79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,
-  61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,58,56,
-  48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,
-  100,100,114,101,115,115,61,52,48,45,55,100,58,48,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,
-  48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,98,102,58,56,48,48,48,45,
-  102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,
-  115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,
-  10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,
-  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,
-  45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,
-  115,61,55,48,45,55,100,58,48,48,48,48,45,55,102,102,102,10,10,98,111,97,114,100,58,32,69,88,72,73,82,79,
-  77,45,82,65,77,45,83,72,65,82,80,82,84,67,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,
-  32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
-  61,48,48,45,51,102,58,56,48,48,48,45,102,102,102,102,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,
-  32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,58,48,48,48,48,45,102,102,102,102,32,98,
-  97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,
-  98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,109,
-  97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,
-  48,120,99,48,48,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,
-  110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,
-  45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97,
-  112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,58,48,48,48,48,45,55,102,102,102,10,32,32,114,116,99,32,
-  109,97,110,117,102,97,99,116,117,114,101,114,61,83,104,97,114,112,10,32,32,32,32,109,97,112,32,97,100,100,114,101,
-  115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,50,56,48,48,45,50,56,48,49,10,32,32,32,32,109,101,109,
-  111,114,121,32,116,121,112,101,61,82,84,67,32,99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110,117,102,97,
-  99,116,117,114,101,114,61,83,104,97,114,112,10,10,98,111,97,114,100,58,32,69,88,76,79,82,79,77,10,32,32,109,
-  101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,
-  32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,58,56,48,48,48,45,102,102,102,102,32,109,
-  97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,52,48,48,48,48,48,10,32,32,32,32,109,
-  97,112,32,97,100,100,114,101,115,115,61,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,
-  48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,48,48,48,48,48,48,10,10,98,111,97,114,100,58,32,69,
-  88,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,
-  116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,
-  55,100,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101,61,
-  48,120,52,48,48,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,56,48,45,102,102,58,56,
-  48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,56,48,48,48,32,98,97,115,101,61,48,120,48,48,
-  48,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,
-  97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,
-  48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,69,88,
-  78,69,67,45,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,
-  101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,
-  100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,
-  112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10,
-  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,48,45,54,55,44,101,48,45,101,55,58,48,48,48,48,
-  45,51,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,
-  116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10,
-  32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,
-  32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,57,54,48,53,48,10,32,32,32,32,109,101,109,111,114,
-  121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,
-  116,117,114,101,61,117,80,68,57,54,48,53,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,
-  54,56,45,54,102,44,101,56,45,101,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,
-  48,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,69,88,83,80,67,55,49,
-  49,48,45,82,65,77,45,69,80,83,79,78,82,84,67,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,
-  77,32,99,111,110,116,101,110,116,61,69,120,112,97,110,115,105,111,110,10,32,32,32,32,109,97,112,32,97,100,100,114,
-  101,115,115,61,52,48,45,52,102,58,48,48,48,48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,
-  105,100,101,110,116,105,102,105,101,114,61,83,80,67,55,49,49,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,
-  115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52,56,51,102,10,32,32,32,32,109,97,112,
-  32,97,100,100,114,101,115,115,61,53,48,44,53,56,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,99,117,
-  10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,
-  48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,48,48,10,32,32,32,32,32,32,109,97,112,
-  32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,
-  99,48,48,48,48,48,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,
-  116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,
-  79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,
-  82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,
-  115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,
-  101,48,48,48,10,32,32,114,116,99,32,109,97,110,117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,32,32,
-  32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,52,48,45,52,
-  56,52,50,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,84,67,32,99,111,110,116,101,110,116,61,
-  84,105,109,101,32,109,97,110,117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,10,98,111,97,114,100,58,32,
-  71,66,45,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,
-  110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,55,100,
-  44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,
-  32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,55,102,
-  102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,
-  105,102,105,101,114,61,73,67,68,32,114,101,118,105,115,105,111,110,61,50,10,32,32,32,32,109,97,112,32,97,100,100,
-  114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,54,55,102,102,44,55,48,48,48,45,
-  55,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,
-  61,66,111,111,116,32,97,114,99,104,105,116,101,99,116,117,114,101,61,76,82,51,53,57,48,50,10,32,32,32,32,111,
-  115,99,105,108,108,97,116,111,114,10,32,32,32,32,115,108,111,116,32,116,121,112,101,61,71,97,109,101,66,111,121,10,
-  10,98,111,97,114,100,58,32,71,83,85,45,82,65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,
-  105,116,101,99,116,117,114,101,61,71,83,85,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,
-  51,102,44,56,48,45,98,102,58,51,48,48,48,45,51,52,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,
-  112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,97,112,
-  32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,
-  97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,
-  53,102,44,99,48,45,100,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,
-  112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,
-  100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,115,105,122,101,
-  61,48,120,50,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,49,44,
-  102,48,45,102,49,58,48,48,48,48,45,102,102,102,102,10,10,98,111,97,114,100,58,32,72,73,82,79,77,10,32,32,
-  109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,
-  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,
-  45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,
-  102,58,48,48,48,48,45,102,102,102,102,10,10,98,111,97,114,100,58,32,72,73,82,79,77,45,82,65,77,10,32,32,
-  109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,
-  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,
-  45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,102,
-  102,58,48,48,48,48,45,102,102,102,102,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,
-  110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,
-  44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,10,98,111,
-  97,114,100,58,32,72,73,84,65,67,72,73,45,76,79,82,79,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,
-  114,99,104,105,116,101,99,116,117,114,101,61,72,71,53,49,66,83,49,54,57,10,32,32,32,32,109,97,112,32,97,100,
-  100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,99,48,48,45,54,102,102,102,44,55,99,48,48,
-  45,55,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,
-  116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,
-  102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,
-  32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,
-  32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,55,58,48,48,48,48,45,55,102,102,102,
-  10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,
-  97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,72,71,53,49,66,83,49,54,57,10,32,32,32,32,109,101,109,
-  111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,
-  101,99,116,117,114,101,61,72,71,53,49,66,83,49,54,57,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,
-  115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,54,98,102,102,44,55,48,48,48,45,55,98,
-  102,102,32,109,97,115,107,61,48,120,102,48,48,48,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,
-  111,97,114,100,58,32,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,
-  110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,
-  45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,
-  10,98,111,97,114,100,58,32,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,
-  82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,
-  101,115,115,61,48,48,45,55,100,44,56,48,45,102,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,
-  120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,
-  83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,
-  58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,76,
-  79,82,79,77,45,82,65,77,35,65,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,
-  116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,
-  51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,
-  32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,
-  32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,
-  102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,10,98,111,97,114,100,58,32,78,69,67,45,72,73,82,79,
-  77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,
-  114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,
-  56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,
-  99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,
-  105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
-  61,48,48,45,49,102,44,56,48,45,57,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,102,102,
-  102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,
-  111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,
-  101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,
-  105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,
-  61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,
-  80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,78,69,
-  67,45,72,73,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,
-  110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,
-  45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,
-  101,115,115,61,52,48,45,55,100,44,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,109,101,109,111,
-  114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,
-  32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,
-  97,115,107,61,48,120,101,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,
-  117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,
-  102,44,56,48,45,57,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,102,102,102,10,32,32,32,
+  99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,78,69,67,45,72,73,82,79,77,45,82,65,77,10,32,
+  32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,
+  10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,
+  48,45,102,102,102,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,55,100,44,99,48,45,
+  102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,
+  111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,
+  102,44,97,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,
+  112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,
+  32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,54,48,48,48,45,
+  55,102,102,102,32,109,97,115,107,61,48,120,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,
+  82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,
+  61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,
+  116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,
+  32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,
+  97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,
+  116,111,114,10,10,98,111,97,114,100,58,32,78,69,67,45,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,
+  121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,
+  97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,32,109,97,
+  115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,
+  114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,51,48,45,51,102,
+  44,98,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,51,102,102,102,10,32,32,32,
   32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,
   32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,
   32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,
   117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,
   99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,
   53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,78,69,67,45,76,79,82,
-  79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,
-  103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,
-  58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,
-  111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,
-  97,100,100,114,101,115,115,61,51,48,45,51,102,44,98,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,
-  115,107,61,48,120,51,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,
-  110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,
-  50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,
-  97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,
-  111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,
-  101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,
-  111,97,114,100,58,32,78,69,67,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,
-  101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,
-  100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,
-  61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,
-  116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,
-  102,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,
-  115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,
-  112,32,97,100,100,114,101,115,115,61,54,48,45,54,102,44,101,48,45,101,102,58,48,48,48,48,45,55,102,102,102,32,
-  109,97,115,107,61,48,120,51,102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,
-  99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,
-  55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,
-  61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,
-  101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,
-  105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,
-  10,98,111,97,114,100,58,32,78,69,67,45,76,79,82,79,77,45,82,65,77,35,65,10,32,32,109,101,109,111,114,121,
-  32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,
-  112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,32,
-  109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,
-  110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,
-  44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,
-  104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,
-  115,61,50,48,45,51,102,44,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,51,
-  102,102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,
-  80,114,111,103,114,97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,
-  32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,
-  99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,
-  112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,
-  61,117,80,68,55,55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,
-  79,66,67,49,45,76,79,82,79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,
-  32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
-  61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,
-  48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,79,66,67,49,10,32,
-  32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,
-  55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
-  61,55,48,45,55,49,44,102,48,45,102,49,58,54,48,48,48,45,55,102,102,102,44,101,48,48,48,45,102,102,102,102,
-  32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,
-  32,99,111,110,116,101,110,116,61,83,97,118,101,10,10,98,111,97,114,100,58,32,83,65,49,45,82,65,77,10,32,32,
-  112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,87,54,53,67,56,49,54,83,10,
-  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,50,50,48,48,
-  45,50,51,102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,
-  48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,52,48,56,
-  48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,
-  45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,
-  101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,
-  99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,
-  48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,115,105,122,101,61,48,120,50,48,48,48,
-  10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,45,52,102,58,48,48,48,48,45,102,102,
-  102,102,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,73,
-  110,116,101,114,110,97,108,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,
-  56,48,45,98,102,58,51,48,48,48,45,51,55,102,102,32,115,105,122,101,61,48,120,56,48,48,10,10,98,111,97,114,
-  100,58,32,83,68,68,49,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,
-  68,68,49,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,
-  52,56,48,48,45,52,56,48,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,
-  101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,
-  109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,
-  32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,
-  109,10,10,98,111,97,114,100,58,32,83,68,68,49,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,
-  61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,
+  79,77,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,
+  61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,
+  48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,109,101,109,
+  111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,109,97,
+  112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,55,102,102,102,32,
+  109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,
+  116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,48,45,
+  54,102,44,101,48,45,101,102,58,48,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,51,102,102,102,10,32,
+  32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,
+  97,109,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,
+  114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,
+  99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,
+  77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,
+  55,50,53,10,32,32,32,32,111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,78,69,67,45,76,
+  79,82,79,77,45,82,65,77,35,65,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,
+  116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,
+  49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,
+  32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,
+  32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,
+  102,102,102,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,
+  55,55,50,53,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,
+  58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,51,102,102,102,10,32,32,32,32,109,101,109,111,114,
+  121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,32,97,114,99,104,105,
+  116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,
+  82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,
+  68,55,55,50,53,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,
+  116,61,68,97,116,97,32,97,114,99,104,105,116,101,99,116,117,114,101,61,117,80,68,55,55,50,53,10,32,32,32,32,
+  111,115,99,105,108,108,97,116,111,114,10,10,98,111,97,114,100,58,32,79,66,67,49,45,76,79,82,79,77,45,82,65,
+  77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,
+  114,97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,
+  56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,
+  114,32,105,100,101,110,116,105,102,105,101,114,61,79,66,67,49,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,
   115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,
-  48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,51,58,48,48,48,48,45,102,
-  102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,
-  116,105,102,105,101,114,61,83,68,68,49,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,
-  102,44,56,48,45,98,102,58,52,56,48,48,45,52,56,48,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,
-  109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,
-  102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,
-  102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,
-  116,61,80,114,111,103,114,97,109,10,10,98,111,97,114,100,58,32,83,80,67,55,49,49,48,45,82,65,77,10,32,32,
-  112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,80,67,55,49,49,48,10,32,32,32,
-  32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52,56,
-  51,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,53,48,44,53,56,58,48,48,48,48,45,102,102,
-  102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,
-  51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,48,48,
-  10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,
-  102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,
-  112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,
-  111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,10,32,32,32,32,109,101,
-  109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,
-  32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,
-  102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,10,98,111,97,114,100,58,32,83,80,67,55,49,49,48,45,82,
-  65,77,45,69,80,83,79,78,82,84,67,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,
-  101,114,61,83,80,67,55,49,49,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,
-  44,56,48,45,98,102,58,52,56,48,48,45,52,56,51,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
-  61,53,48,44,53,56,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,
-  97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,
-  32,109,97,115,107,61,48,120,56,48,48,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,
-  61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,
-  32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,
-  103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,
-  110,116,61,68,97,116,97,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,
-  101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,
-  44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,114,
-  116,99,32,109,97,110,117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,32,32,32,32,109,97,112,32,97,100,
-  100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,52,48,45,52,56,52,50,10,32,32,32,32,
-  109,101,109,111,114,121,32,116,121,112,101,61,82,84,67,32,99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110,
-  117,102,97,99,116,117,114,101,114,61,69,112,115,111,110,10,10,98,111,97,114,100,58,32,83,84,45,76,79,82,79,77,
-  10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,
-  97,109,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,
-  48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,115,108,111,116,32,116,121,112,101,
-  61,83,117,102,97,109,105,84,117,114,98,111,10,32,32,32,32,114,111,109,10,32,32,32,32,32,32,109,97,112,32,97,
-  100,100,114,101,115,115,61,50,48,45,51,102,44,97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,
-  107,61,48,120,56,48,48,48,10,32,32,32,32,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,
-  115,115,61,54,48,45,54,102,44,101,48,45,101,102,58,48,48,48,48,45,102,102,102,102,10,32,32,115,108,111,116,32,
-  116,121,112,101,61,83,117,102,97,109,105,84,117,114,98,111,10,32,32,32,32,114,111,109,10,32,32,32,32,32,32,109,
-  97,112,32,97,100,100,114,101,115,115,61,52,48,45,53,102,44,99,48,45,100,102,58,48,48,48,48,45,102,102,102,102,
-  32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,
-  100,100,114,101,115,115,61,55,48,45,55,100,44,102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,10,
+  48,48,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,49,44,102,48,45,102,49,58,
+  54,48,48,48,45,55,102,102,102,44,101,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,
+  32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,
+  10,10,98,111,97,114,100,58,32,83,65,49,45,82,65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,97,114,99,
+  104,105,116,101,99,116,117,114,101,61,87,54,53,67,56,49,54,83,10,32,32,32,32,109,97,112,32,97,100,100,114,101,
+  115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,50,50,48,48,45,50,51,102,102,10,32,32,32,32,109,99,117,
+  10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,
+  48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,52,48,56,48,48,48,10,32,32,32,32,32,32,109,97,112,
+  32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,
+  101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,
+  32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,
+  32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,
+  48,48,45,55,102,102,102,32,115,105,122,101,61,48,120,50,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,
+  100,114,101,115,115,61,52,48,45,52,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,101,109,111,114,121,
+  32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,73,110,116,101,114,110,97,108,10,32,32,32,32,32,
+  32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,51,48,48,48,45,51,55,
+  102,102,32,115,105,122,101,61,48,120,56,48,48,10,10,98,111,97,114,100,58,32,83,68,68,49,10,32,32,112,114,111,
+  99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,68,68,49,10,32,32,32,32,109,97,112,32,97,
+  100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52,56,48,102,10,32,32,32,
+  32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,
+  98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,
+  48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,
+  61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,10,98,111,97,114,100,58,32,83,68,68,
+  49,45,82,65,77,10,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,
+  83,97,118,101,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,
+  58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,32,32,109,97,112,32,97,
+  100,100,114,101,115,115,61,55,48,45,55,51,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,
+  48,48,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,68,68,49,10,32,
+  32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,
+  52,56,48,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,
+  48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,97,112,32,97,
+  100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,32,32,109,101,109,
+  111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,10,98,111,
+  97,114,100,58,32,83,80,67,55,49,49,48,45,82,65,77,10,32,32,112,114,111,99,101,115,115,111,114,32,105,100,101,
+  110,116,105,102,105,101,114,61,83,80,67,55,49,49,48,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,
+  48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52,56,51,102,10,32,32,32,32,109,97,112,32,97,100,
+  100,114,101,115,115,61,53,48,44,53,56,58,48,48,48,48,45,102,102,102,102,10,32,32,32,32,109,99,117,10,32,32,
+  32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,56,48,48,48,
+  45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,48,48,10,32,32,32,32,32,32,109,97,112,32,97,100,
+  100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,99,48,48,
+  48,48,48,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,
+  116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,79,77,32,
+  99,111,110,116,101,110,116,61,68,97,116,97,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,65,77,
+  32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,
+  48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,102,102,102,32,109,97,115,107,61,48,120,101,48,48,
+  48,10,10,98,111,97,114,100,58,32,83,80,67,55,49,49,48,45,82,65,77,45,69,80,83,79,78,82,84,67,10,32,
+  32,112,114,111,99,101,115,115,111,114,32,105,100,101,110,116,105,102,105,101,114,61,83,80,67,55,49,49,48,10,32,32,
+  32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,52,56,48,48,45,52,
+  56,51,102,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,53,48,44,53,56,58,48,48,48,48,45,102,
+  102,102,102,10,32,32,32,32,109,99,117,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,
+  45,51,102,44,56,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,48,
+  48,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,99,48,45,102,102,58,48,48,48,48,45,102,
+  102,102,102,32,109,97,115,107,61,48,120,99,48,48,48,48,48,10,32,32,32,32,32,32,109,101,109,111,114,121,32,116,
+  121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,32,32,109,101,
+  109,111,114,121,32,116,121,112,101,61,82,79,77,32,99,111,110,116,101,110,116,61,68,97,116,97,10,32,32,32,32,109,
+  101,109,111,114,121,32,116,121,112,101,61,82,65,77,32,99,111,110,116,101,110,116,61,83,97,118,101,10,32,32,32,32,
+  32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,48,45,98,102,58,54,48,48,48,45,55,
+  102,102,102,32,109,97,115,107,61,48,120,101,48,48,48,10,32,32,114,116,99,32,109,97,110,117,102,97,99,116,117,114,
+  101,114,61,69,112,115,111,110,10,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,48,48,45,51,102,44,56,
+  48,45,98,102,58,52,56,52,48,45,52,56,52,50,10,32,32,32,32,109,101,109,111,114,121,32,116,121,112,101,61,82,
+  84,67,32,99,111,110,116,101,110,116,61,84,105,109,101,32,109,97,110,117,102,97,99,116,117,114,101,114,61,69,112,115,
+  111,110,10,10,98,111,97,114,100,58,32,83,84,45,76,79,82,79,77,10,32,32,109,101,109,111,114,121,32,116,121,112,
+  101,61,82,79,77,32,99,111,110,116,101,110,116,61,80,114,111,103,114,97,109,10,32,32,32,32,109,97,112,32,97,100,
+  100,114,101,115,115,61,48,48,45,49,102,44,56,48,45,57,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,
+  61,48,120,56,48,48,48,10,32,32,115,108,111,116,32,116,121,112,101,61,83,117,102,97,109,105,84,117,114,98,111,10,
+  32,32,32,32,114,111,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,50,48,45,51,102,44,
+  97,48,45,98,102,58,56,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,32,32,32,32,
+  114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,54,48,45,54,102,44,101,48,45,101,
+  102,58,48,48,48,48,45,102,102,102,102,10,32,32,115,108,111,116,32,116,121,112,101,61,83,117,102,97,109,105,84,117,
+  114,98,111,10,32,32,32,32,114,111,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,52,48,
+  45,53,102,44,99,48,45,100,102,58,48,48,48,48,45,102,102,102,102,32,109,97,115,107,61,48,120,56,48,48,48,10,
+  32,32,32,32,114,97,109,10,32,32,32,32,32,32,109,97,112,32,97,100,100,114,101,115,115,61,55,48,45,55,100,44,
+  102,48,45,102,102,58,48,48,48,48,45,102,102,102,102,10,10,
 };
 
 const unsigned char iplrom[64] = {
diff --git a/shaders/CRT-Royale.shader/bloom-approx.fs b/shaders/CRT-Royale.shader/bloom-approx.fs
new file mode 100644
index 00000000..a56c09d6
--- /dev/null
+++ b/shaders/CRT-Royale.shader/bloom-approx.fs
@@ -0,0 +1,13973 @@
+#version 150
+
+uniform sampler2D source[];
+uniform vec4 sourceSize[];
+uniform vec4 targetSize;
+
+in Vertex {
+   vec2 vTexCoord;
+   vec2 tex_uv;
+   vec2 blur_dxdy;
+   vec2 uv_scanline_step;
+   float estimated_viewport_size_x;
+   vec2 texture_size_inv;
+   vec2 tex_uv_to_pixel_scale;
+};
+
+out vec4 FragColor;
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms
+#define mul(a,b) (b*a)
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(y,x) atan(y,x)  // HLSL/Cg atan2(y,x) and GLSL atan(y,x) take their arguments in the same order; do not swap them
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+#define ORIG_LINEARIZEDvideo_size sourceSize[1].xy
+#define ORIG_LINEARIZEDtexture_size sourceSize[1].xy
+#define ORIG_LINEARIZED source[1]
+
+float bloom_approx_scale_x = targetSize.x / sourceSize[0].y;  //  NOTE(review): output width over source *height* (.y) -- confirm the x/y mix is intended
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);  //  "const" is #defined to nothing above, so this is a plain (mutable) global
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an earlier note said 0.5 looked too dark and used 1.0, but the value here is 0.5)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default and the value in effect (raise toward 1.0 if the result looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (raise toward 1.0 if 0.5 looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; the sign of c is dropped
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else   //  No discontinuity fix: the border is carried over unchanged.
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else   //  No same-pass curvature factor: round the pixel border up as-is.
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  General case: one tile plus twice the tile border.
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: always use the bordered layout.
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Build a unit-length (x, y) aspect vector.  The requested ratio is capped
+    //  at geom_max_aspect_ratio first; normalizing afterwards keeps the
+    //  absolute scale from leaking into the uv-mapping used for curvature:
+    const float capped_aspect_ratio =
+        min(geom_aspect_ratio, geom_max_aspect_ratio);
+    //  x carries the capped ratio; y is fixed at 1.0 before normalizing.
+    return normalize(float2(capped_aspect_ratio, 1.0));
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Packs geom_overscan_x and geom_overscan_y into one (x, y) vector.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Packs geom_tilt_angle_x and geom_tilt_angle_y into one (x, y) vector.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  Packs the x convergence offsets of the three channels as (r, g, b).
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  Packs the y convergence offsets of the three channels as (r, g, b).
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Packs the r channel's convergence offsets as (x, y).
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Packs the g channel's convergence offsets as (x, y).
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Packs the b channel's convergence offsets as (x, y).
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{   //  Returns the runtime offsets only when both RUNTIME_* macros are defined.
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #else   //  Runtime weights, but static subpixel offsets:
+            return aa_subpixel_r_offset_static;
+        #endif  //  RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #else   //  Static weights always use the static offset:
+        return aa_subpixel_r_offset_static;
+    #endif  //  RUNTIME_ANTIALIAS_WEIGHTS
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  The amplification factor is the reciprocal of the average color of
+    //  whichever mask mask_type selects.  The thresholds split mask_type into
+    //  three bands: below 0.5 (grille), below 1.5 (slot), otherwise shadow.
+    if(mask_type < 0.5) {   return 1.0/mask_grille_avg_color;   }
+    if(mask_type < 1.5) {   return 1.0/mask_slot_avg_color;     }
+    return 1.0/mask_shadow_avg_color;
+}
+
+inline float get_mask_sample_mode()
+{
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE    //  Use the chosen mode as-is:
+        #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+            return mask_sample_mode_desired;
+        #else
+            return mask_sample_mode_static;
+        #endif
+    #else   //  No manual resize: keep the chosen mode within [1.0, 2.0]:
+        #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+            return clamp(mask_sample_mode_desired, 1.0, 2.0);
+        #else
+            return clamp(mask_sample_mode_static, 1.0, 2.0);
+        #endif
+    #endif  //  PHOSPHOR_MASK_MANUALLY_RESIZE
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Defaults taken from the standard gamma constants:
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else   //  Derive the first/last pass gammas from the SIMULATE_* macros:
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  Otherwise, check SIMULATE_GBA_ON_LCD:
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  Otherwise, check SIMULATE_LCD_ON_CRT:
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Otherwise, check SIMULATE_GBA_ON_CRT:
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Later passes read their input without decoding (gamma 1.0):
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Earlier passes write their output without encoding (gamma 1.0):
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif  //  LAST_PASS
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass both decodes and encodes.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Later passes decode with the intermediate gamma:
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Earlier passes encode with the intermediate gamma:
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode the rgb channels of a pass's output color, when the pass
+    //  is configured to do so (gamma_encode_output); otherwise hand the color
+    //  back untouched.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    //  Raise rgb to the reciprocal of this pass's output gamma:
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    //  Alpha is either forced opaque or carried over from the input:
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize the rgb channels of a sampled color, when this pass is
+    //  configured to do so (linearize_input); otherwise hand the color back
+    //  untouched.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    //  Raise rgb to this pass's input gamma:
+    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    //  Alpha is either forced opaque or carried over from the input:
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Decode rgb with the caller's per-channel gamma (linearize_input is not
+    //  consulted).  Alpha is forced to 1.0 only if assume_opaque_alpha is set.
+    float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D:
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)  //  tex_coords is deliberately not const; see the FIXME note above the wrappers.
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }   //  Sample, then decode via decode_input().
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)  //  Only tex_coords.xy is used; .z is ignored.
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)  //  NOTE(review): texel_off is forwarded as textureLod's third argument (the LOD in GLSL), not applied as a texel offset -- confirm this is intended.
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)  //  NOTE(review): same as above; only tex_coords.xy is used and texel_off acts as the LOD.
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)  //  Always samples at LOD 0.0; tex_coords.zw is ignored.
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)  //  NOTE(review): texel_off is passed as the LOD and tex_coords.zw is ignored -- confirm this is intended.
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D: sample with COMPAT_TEXTURE, then decode the sample with the caller-supplied gamma (the float3 overload uses only .xy).
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   float4 raw_sample = COMPAT_TEXTURE(tex, tex_coords);   return decode_gamma_input(raw_sample, gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   float4 raw_sample = COMPAT_TEXTURE(tex, tex_coords.xy);   return decode_gamma_input(raw_sample, gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: tex_coords.xy is the sample position and tex_coords.w the explicit mip LOD (Cg tex2Dlod convention), so .w is passed through instead of forcing level 0; the sample is then decoded with the given gamma.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, tex_coords.w), gamma);    }
+//  Offset overload: the LOD is tex_coords.w, never texel_off.  GLSL texel offsets must be constant expressions, so texel_off cannot be applied here and is ignored.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, tex_coords.w), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default.  NOTE(review): an earlier note here said this was raised to 1.0 because 0.5 looked too dark, but the value is 0.5 - confirm which is intended
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  option 2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  option 1: 3x3 resize blur (selected when RADEON_FIX is #defined)
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default (and current value) is 0.5; raise it toward 1.0 if the image looks unnecessarily dark
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): a prior note said 0.5 looked too dark and that 1.0 was used, but the value here is 0.5 -- confirm.
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently, e.g. to re-add horizontal padding as noted
+    //  above.  Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+//  Choose the grille average matching the PHOSPHOR_MASK_GRILLE14 setting:
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+//  (FIX_ZERO() takes abs(c) first, so it never preserves the sign of c.)
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  #undef all but the winner (lod > bias > flat-twice > fix-discontinuities).
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering (if not bypassed by tex2Dlod/bias) adds about 0.5:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the standard parameter to erf() (any sign).
+    //  Returns:    An Abramowitz/Stegun three-coefficient approximation of:
+    //                  erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-u**2) du
+    //              The max absolute error is 2.5*10**-5.  The fit is evaluated
+    //              on abs(x) and mirrored with sign(x), since erf() is odd.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    static const float4 one = float4(1.0);
+    const float4 inv_term = one/(one + 0.47047*abs(x));
+    const float4 poly_sum = inv_term*(0.3480242 +
+        inv_term*(-0.0958798 + inv_term*0.7478556));
+    const float4 abs_erf = one - poly_sum*exp(-(x*x));
+    return abs_erf * sign(x);
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 version: Abramowitz/Stegun fit on abs(x), mirrored by sign(x).
+    static const float3 one = float3(1.0);
+    const float3 inv_term = one/(one + 0.47047*abs(x));
+    const float3 poly_sum = inv_term*(0.3480242 +
+        inv_term*(-0.0958798 + inv_term*0.7478556));
+    const float3 abs_erf = one - poly_sum*exp(-(x*x));
+    return abs_erf * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 version: Abramowitz/Stegun fit on abs(x), mirrored by sign(x).
+    static const float2 one = float2(1.0);
+    const float2 inv_term = one/(one + 0.47047*abs(x));
+    const float2 poly_sum = inv_term*(0.3480242 +
+        inv_term*(-0.0958798 + inv_term*0.7478556));
+    const float2 abs_erf = one - poly_sum*exp(-(x*x));
+    return abs_erf * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Float version: Abramowitz/Stegun fit on abs(x), mirrored by sign(x).
+    const float inv_term = 1.0/(1.0 + 0.47047*abs(x));
+    const float poly_sum = inv_term*(0.3480242 +
+        inv_term*(-0.0958798 + inv_term*0.7478556));
+    const float abs_erf = 1.0 - poly_sum*exp(-(x*x));
+    return abs_erf * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf(x) approximated as tanh(1.202760580 * x).  The error is
+    //              visually noticeable, but it's blazing fast and perceptually
+    //              close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Needs a correct driver tanh(): An nVidia 8800GTS gave garbage.
+    const float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    const float3 scaled_x = 1.202760580 * x;  //  Float3 tanh-based erf() fit
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    const float2 scaled_x = 1.202760580 * x;  //  Float2 tanh-based erf() fit
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    const float scaled_x = 1.202760580 * x;  //  Float tanh-based erf() fit
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 version: erft(x) if ERF_FAST_APPROXIMATION is set, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 version: erft(x) if ERF_FAST_APPROXIMATION is set, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Float version: erft(x) if ERF_FAST_APPROXIMATION is set, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller precomputes this value so it
+    //                  can be reused outside this function.
+    //  Returns:    Approximate gamma function (real-numbered factorial) output
+    //              from a two-coefficient Lanczos approximation, with the
+    //              coefficients calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  We could use three coeffs (0.0000346 error)
+    //              without hurting latency, but this allows more parallelism
+    //              with outside instructions.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler_e = float4(2.71828182845904523536028747135266249775724709);
+    const float4 s_plus_half = s + float4(0.5);
+    const float4 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    const float4 series_sum = coeff0 + coeff1/(s + float4(1.0));
+    //  gamma(s + 1) = pow_base**s_plus_half * series_sum.  Dividing by s
+    //  afterward (via s_inv) has less error for small s than shifting s first:
+    const float4 gamma_s_plus_one = pow(pow_base, s_plus_half) * series_sum;
+    return gamma_s_plus_one * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 version: Two-coefficient Lanczos gamma(s + 1), scaled by s_inv.
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler_e = float3(2.71828182845904523536028747135266249775724709);
+    const float3 s_plus_half = s + float3(0.5);
+    const float3 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    const float3 series_sum = coeff0 + coeff1/(s + float3(1.0));
+    return (pow(pow_base, s_plus_half) * series_sum) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 version: Two-coefficient Lanczos gamma(s + 1), scaled by s_inv.
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler_e = float2(2.71828182845904523536028747135266249775724709);
+    const float2 s_plus_half = s + float2(0.5);
+    const float2 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    const float2 series_sum = coeff0 + coeff1/(s + float2(1.0));
+    return (pow(pow_base, s_plus_half) * series_sum) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Float version: Two-coefficient Lanczos gamma(s + 1), scaled by s_inv.
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler_e = 2.71828182845904523536028747135266249775724709;
+    const float s_plus_half = s + 0.5;
+    const float pow_base = (s_plus_half + lanczos_g)/euler_e;
+    const float series_sum = coeff0 + coeff1/(s + 1.0);
+    return (pow(pow_base, s_plus_half) * series_sum) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard parameter to the gamma function, and it
+    //              should lie in the [0, 36] range.
+    //  Returns:    Approximate gamma(s) (max relative error 0.000463); see gamma_impl.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    const float3 s_inv = float3(1.0)/s;  //  Float3 version
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    const float2 s_inv = float2(1.0)/s;  //  Float2 version
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    const float s_inv = 1.0/s;  //  Float version
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms):
+    //                  z**s * sum over k in [0, 3] of
+    //                      (-1)**k * z**k / ((s + k) * k!)
+    //  The actual "rolled up" summation looks like:
+    //      last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
+    //      sum = last_sign * last_pow / ((s + 0) * last_factorial)
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          last_sign *= -1.0; last_pow *= z; last_factorial *= k;
+    //          sum += last_sign * last_pow / ((s + k) * last_factorial);
+    //      }
+    //  Unrolled and constant-folded below.  A fifth term (k = 4) would add
+    //  z**4/(24.0*s + 96.0), but it's left out:
+    const float4 z_pow_s = pow(z, s);
+    const float4 z_sq = z*z;
+    //  Summation iteration 0 is just s_inv.  Iterations 1, 2, and 3 each
+    //  fold (s + k) * k! into one denominator:
+    const float4 term1 = z/(s + float4(1.0));
+    const float4 term2 = z_sq/(2.0*s + float4(4.0));
+    const float4 term3 = (z * z_sq)/(6.0*s + float4(18.0));
+    //  Accumulate in the order 0, 1, 2, 3:
+    const float4 series_sum = ((s_inv - term1) + term2) - term3;
+    //  Scale and return:
+    return z_pow_s * series_sum;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 version: z**s times a 4-term alternating series, where term k
+    //  is (-1)**k * z**k / ((s + k) * k!) and term 0 is passed in as s_inv:
+    const float3 z_pow_s = pow(z, s);
+    const float3 z_sq = z*z;
+    const float3 term1 = z/(s + float3(1.0));
+    const float3 term2 = z_sq/(2.0*s + float3(4.0));
+    const float3 term3 = (z * z_sq)/(6.0*s + float3(18.0));
+    //  Accumulate in the order 0, 1, 2, 3:
+    const float3 series_sum = ((s_inv - term1) + term2) - term3;
+    //  Scale and return:
+    return z_pow_s * series_sum;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 version: z**s times a 4-term alternating series, where term k
+    //  is (-1)**k * z**k / ((s + k) * k!) and term 0 is passed in as s_inv:
+    const float2 z_pow_s = pow(z, s);
+    const float2 z_sq = z*z;
+    const float2 term1 = z/(s + float2(1.0));
+    const float2 term2 = z_sq/(2.0*s + float2(4.0));
+    const float2 term3 = (z * z_sq)/(6.0*s + float2(18.0));
+    //  Accumulate in the order 0, 1, 2, 3:
+    const float2 series_sum = ((s_inv - term1) + term2) - term3;
+    //  Scale and return:
+    return z_pow_s * series_sum;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Float version: z**s times a 4-term alternating series, where term k
+    //  is (-1)**k * z**k / ((s + k) * k!) and term 0 is passed in as s_inv:
+    const float z_pow_s = pow(z, s);
+    const float z_sq = z*z;
+    const float term1 = z/(s + 1.0);
+    const float term2 = z_sq/(2.0*s + 4.0);
+    const float term3 = (z * z_sq)/(6.0*s + 18.0);
+    //  Accumulate in the order 0, 1, 2, 3:
+    const float series_sum = ((s_inv - term1) + term2) - term3;
+    //  Scale and return:
+    return z_pow_s * series_sum;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+    //  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up:"
+    //      denom = float4('inf');
+    //      float4 one = float4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+    //      }
+    //  Unrolled and constant-folded, with one named value per level (level4
+    //  is the innermost, where dividing by 'inf' leaves no fraction term):
+    const float4 level4 = float4(7.0) + z - s;
+    const float4 level3 = float4(5.0) + z - s + (3.0*s - float4(9.0))/level4;
+    const float4 level2 = float4(3.0) + z - s + (2.0*s - float4(4.0))/level3;
+    const float4 level1 = float4(1.0) + z - s + (s - float4(1.0))/level2;
+    return (pow(z, s) * exp(-z)) / level1;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 version: 4-level continued fraction, built from the innermost
+    //  level (level4) outward, dividing z**s * e**(-z) by the outermost level:
+    const float3 level4 = float3(7.0) + z - s;
+    const float3 level3 = float3(5.0) + z - s + (3.0*s - float3(9.0))/level4;
+    const float3 level2 = float3(3.0) + z - s + (2.0*s - float3(4.0))/level3;
+    const float3 level1 = float3(1.0) + z - s + (s - float3(1.0))/level2;
+    return (pow(z, s) * exp(-z)) / level1;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 version: 4-level continued fraction, built from the innermost
+    //  level (level4) outward, dividing z**s * e**(-z) by the outermost level:
+    const float2 level4 = float2(7.0) + z - s;
+    const float2 level3 = float2(5.0) + z - s + (3.0*s - float2(9.0))/level4;
+    const float2 level2 = float2(3.0) + z - s + (2.0*s - float2(4.0))/level3;
+    const float2 level1 = float2(1.0) + z - s + (s - float2(1.0))/level2;
+    return (pow(z, s) * exp(-z)) / level1;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: four-level continued fraction, innermost level first.
+    const float level4 = 7.0 + z - s;
+    const float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
+    const float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
+    const float level1 = 1.0 + z - s + (s - 1.0)/level2;
+    const float scaled_power = pow(z, s) * exp(-z);
+    return scaled_power / level1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Preconditions:  1.) s < ~0.5
+    //                  2.) s_inv = 1/s (caller precomputes it for reuse)
+    //                  3.) gamma_s_inv = 1/gamma(s) (likewise precomputed)
+    //  Result:     An approximation of the normalized lower incomplete gamma
+    //              function for s < 0.5.  Restricting s this way leaves only
+    //              two z-dependent branches (not four) to evaluate.  Each
+    //              branch uses four terms, with a max relative error of
+    //              ~0.00182.  The branch threshold and specifics were adapted
+    //              for fewer terms from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both branches are always computed and then blended with 0/1 masks:
+    //  Real branches test slower even when available.
+    static const float thresh = 0.775075;
+    const bool4 use_large = bool4(z.x > thresh, z.y > thresh,
+        z.z > thresh, z.w > thresh);
+    const float4 large_mask = float4(use_large);
+    const float4 small_mask = float4(not(use_large));
+    //  Large z: one minus the normalized upper incomplete gamma function.
+    const float4 from_large = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    //  Small z: normalized series expansion.
+    const float4 from_small = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_large * large_mask + from_small * small_mask;
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  float3 overload: evaluate both z branches, then blend with 0/1 masks.
+    static const float thresh = 0.775075;
+    const bool3 use_large = bool3(z.x > thresh, z.y > thresh, z.z > thresh);
+    const float3 large_mask = float3(use_large);
+    const float3 small_mask = float3(not(use_large));
+    //  Large z: one minus the normalized upper incomplete gamma function.
+    const float3 from_large = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    //  Small z: normalized series expansion.
+    const float3 from_small = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_large * large_mask + from_small * small_mask;
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  float2 overload: evaluate both z branches, then blend with 0/1 masks.
+    static const float thresh = 0.775075;
+    const bool2 use_large = bool2(z.x > thresh, z.y > thresh);
+    const float2 large_mask = float2(use_large);
+    const float2 small_mask = float2(not(use_large));
+    //  Large-z result first, then the small-z series result:
+    const float2 from_large = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float2 from_small = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_large * large_mask + from_small * small_mask;
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    static const float thresh = 0.775075;   //  Small-z/large-z branch cutoff
+    const bool use_large = z > thresh;
+    //  Scalar overload: evaluate both branches, then blend with 0/1 masks.
+    const float from_large = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float from_small = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_large * float(use_large) + from_small * float(!use_large);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Precondition:   s < ~0.5
+    //  Result:     The approximate normalized lower incomplete gamma function
+    //              for s < 0.5; normalized_ligamma_impl() has the details.
+    //  Precompute 1/s once; both gamma_impl() and the impl call consume it.
+    const float4 one_over_s = float4(1.0)/s;
+    return normalized_ligamma_impl(s, z, one_over_s, float4(1.0)/gamma_impl(s, one_over_s));
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  float3 overload: precompute 1/s and the reciprocal of gamma_impl(),
+    //  then defer to normalized_ligamma_impl().
+    const float3 one_over_s = float3(1.0)/s;
+    return normalized_ligamma_impl(s, z, one_over_s, float3(1.0)/gamma_impl(s, one_over_s));
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  float2 overload: precompute 1/s and the reciprocal of gamma_impl(),
+    //  then defer to normalized_ligamma_impl().
+    const float2 one_over_s = float2(1.0)/s;
+    return normalized_ligamma_impl(s, z, one_over_s, float2(1.0)/gamma_impl(s, one_over_s));
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload: precompute 1/s and the reciprocal of gamma_impl(),
+    //  then defer to normalized_ligamma_impl().
+    const float one_over_s = 1.0/s;
+    return normalized_ligamma_impl(s, z, one_over_s, 1.0/gamma_impl(s, one_over_s));
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Standard gamma constants; #define OVERRIDE_STANDARD_GAMMA to supply your own:
+#if !defined(OVERRIDE_STANDARD_GAMMA)
+    //  Gammas used by the standard encodings:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (for emulating devices only).  The CRT/LCD
+    //  reference gammas exceed the NTSC and Rec.709 video standard gammas:
+    //  Those standards purposely undercorrected for an analog CRT's assumed 2.5
+    //  reference display gamma to maintain contrast in assumed [dark] viewing
+    //  conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Treating alpha as 1.0 everywhere might help users sidestep some bugs, but
+//  only if they know it's happening, so it stays off unless overridden.
+#if !defined(OVERRIDE_ALPHA_ASSUMPTIONS)
+    static const bool assume_opaque_alpha = false;
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma getters; #define OVERRIDE_DEVICE_GAMMA to supply the values:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    //  Defaults: the high CRT reference gamma, the office LCD gamma, and 3.5.
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#else   //  The user promises to globally define the appropriate constants:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#if defined(OVERRIDE_FINAL_GAMMA)
+    //  The user promises to globally define the appropriate constants:
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    //  Choose the first-pass input (decoding) gamma and the last-pass output
+    //  (encoding) gamma.  Only the first defined SIMULATE_* macro takes
+    //  effect, tested in this order:
+    //      SIMULATE_CRT_ON_LCD, SIMULATE_GBA_ON_LCD,
+    //      SIMULATE_LCD_ON_CRT, SIMULATE_GBA_ON_CRT
+    //  With none defined, nothing is simulated and both are ntsc_gamma.
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_* selection
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  linearize_input and
+//  gamma_encode_output are plain static constants: They aren't derived, and
+//  static constants let the compiler do dead-code elimination.
+#ifdef GAMMA_ENCODE_EVERY_FBO   //  Every pass decodes and encodes:
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifndef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #else
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #endif
+    #ifndef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #else
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #endif
+#else   //  Only the first pass decodes; only the last pass encodes:
+    #ifndef FIRST_PASS
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #else
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #endif
+    #ifndef LAST_PASS
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #else
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Lets users branch on whether bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encodes a pass's final color per the pass policy:
+    //      - gamma_encode_output false:  color is returned untouched.
+    //      - otherwise:  rgb is raised to 1.0/get_pass_output_gamma().
+    //  Alpha passes through unless assume_opaque_alpha forces it to 1.0.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    float3 exponent = float3(1.0/get_pass_output_gamma());
+    float3 encoded_rgb = pow(color.rgb, exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearizes a sampled color per the pass policy:
+    //      - linearize_input false:  color is returned untouched.
+    //      - otherwise:  rgb is raised to get_pass_input_gamma().
+    //  Alpha passes through unless assume_opaque_alpha forces it to 1.0.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    float3 exponent = float3(get_pass_input_gamma());
+    float3 linear_rgb = pow(color.rgb, exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Raises rgb to the caller's per-channel gamma (linearize_input is not
+    //  consulted).  Alpha passes through unless assume_opaque_alpha is set.
+    float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: Sample, then decode_input().  Coords stay non-const (see FIXME above).
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)  //  float3 coords overload:
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }   //  only tex_coords.xy is sampled
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)   //  NOTE(review): texel_off is passed
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }   //  as textureLod's LOD argument, not as a texel offset; confirm intent
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)   //  NOTE(review): only .xy is sampled, and texel_off
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }   //  is passed as textureLod's LOD argument, not as a texel offset; confirm intent
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: Only tex_coords.xy is used and the LOD is fixed at 0.0 (.zw is ignored).
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)    //  NOTE(review): tex_coords.zw is ignored and texel_off
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }   //  is passed as textureLod's LOD argument, not as a texel offset; confirm intent
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: sample mip level 0 (tex_coords.zw are ignored), then decode with gamma.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  texel_off is a whole-texel offset, not a mip level: shift uv by it (both axes assumed, as in Cg - confirm).
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy + float2(float(texel_off))/float2(textureSize(tex, 0)), 0.0), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  holding the desired minimum and maximum beam standard
+    //                  deviations, used for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; it is
+    //                  passed in so it need not be recomputed per call when
+    //                  beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Setting beam_spot_shape_function to 1 selects the spherical
+    //              inner subfunction f(color) (see below):
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The per-channel Gaussian beam standard deviation for color:
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube root balances
+    //  between variable width and a soft/realistic shape.  A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    float3 curve;
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Power curve: f(color) = pow(color, beam_spot_power)
+        curve = pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Spherical curve: f(color) = sqrt(1 - (color - 1)^2)
+        const float3 shifted = color - float3(1.0);
+        curve = sqrt(float3(1.0) - shifted*shifted);
+    }
+    return float3(beam_min_sigma) + sigma_range * curve;
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  holding the desired min/max generalized Gaussian
+    //                  beta parameters, for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; it is
+    //                  passed in so it need not be recomputed per call when
+    //                  beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian beta ("shape") for color.
+    //  Details/Discussion:
+    //  Beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    const float3 curve = pow(color, float3(beam_shape_power));
+    return beam_min_shape + shape_range * curve;
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    A scanline's light output over a given pixel.
+    //  Details:
+    //  The CRT beam profile is roughly Gaussian, and wider for bright colors
+    //  than for dark ones.  The integral over the full range of a Gaussian
+    //  is always 1.0, so the beam's standard deviation can vary without
+    //  changing overall brightness.  With 'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  A numerical approximation of the "error function" (the Gaussian
+    //  indefinite integral) gives the definite integral of the scanline's
+    //  average brightness over a given pixel area.  Even if curved coords were
+    //  used in this pass, a flat scalar pixel height works almost as well as a
+    //  pixel height computed from a full pixel-space to scanline-space matrix.
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 half_height = float3(pixel_height * 0.5);
+    const float3 erf_scale = 1.0/(beam_sigma*sqrt(2.0));
+    const float3 erf_upper = erf((dist + half_height)*erf_scale);
+    const float3 erf_lower = erf((dist - half_height)*erf_scale);
+    return color * 0.5*(erf_upper - erf_lower)/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    A scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution lets the shape (beta) vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models the standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 width = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 shape = get_generalized_gaussian_beta(color, shape_range);
+    const float3 width_inv = float3(1.0)/width;
+    const float3 shape_inv = float3(1.0)/shape;
+    //  gamma_impl takes beta alongside s = 1/beta, and normalized_ligamma_impl
+    //  takes beta and 1/gamma(s), so neither has to repeat the divides.
+    const float3 gamma_inv = float3(1.0)/gamma_impl(shape_inv, shape);
+    const float3 half_height = float3(pixel_height * 0.5);
+    const float3 dist_upper = dist + half_height;
+    const float3 dist_lower = dist - half_height;
+    const float3 part_upper = sign(dist_upper) * normalized_ligamma_impl(
+        shape_inv, pow(abs(dist_upper)*width_inv, shape), shape, gamma_inv);
+    const float3 part_lower = sign(dist_lower) * normalized_ligamma_impl(
+        shape_inv, pow(abs(dist_lower)*width_inv, shape), shape, gamma_inv);
+    return color * 0.5*(part_upper - part_lower)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  See scanline_gaussian_integral_contrib() for detailed comments!
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    //  Precompute reciprocals so the exponent and the scale need no divides:
+    const float3 sigma_inv = float3(1.0)/beam_sigma;
+    const float3 exponent_scale = 0.5 * sigma_inv * sigma_inv;
+    const float3 amplitude = sigma_inv/sqrt(2.0*pi);
+    //  Both paths evaluate the Gaussian at dist itself:
+    const float3 weight_center = exp(-(dist*dist)*exponent_scale);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Also sample 1/3 pixel away in each direction:
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_pixel;
+        const float3 dist_minus = abs(dist - third_pixel);
+        //  Average the three pure Gaussian samples:
+        const float3 scale = color/3.0 * amplitude;
+        const float3 weight_plus = exp(-(dist_plus*dist_plus)*exponent_scale);
+        const float3 weight_minus = exp(-(dist_minus*dist_minus)*exponent_scale);
+        return scale * (weight_center + weight_plus + weight_minus);
+    }
+
+    //  No antialiasing: a single sample at dist.
+    return color*weight_center*amplitude;
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  See scanline_generalized_gaussian_integral_contrib() for details!
+    //  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 width = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 shape = get_generalized_gaussian_beta(color, shape_range);
+    //  Precompute reciprocals to avoid repeated divides:
+    const float3 width_inv = float3(1.0)/width;
+    const float3 shape_inv = float3(1.0)/shape;
+    const float3 scale = color * shape * 0.5 * width_inv /
+        gamma_impl(shape_inv, shape);
+    //  Both paths evaluate the distribution at dist itself:
+    const float3 weight_center = exp(-pow(abs(dist*width_inv), shape));
+    if(beam_antialias_level > 0.5)
+    {
+        //  Also sample 1/3 pixel closer to and farther from the scanline.
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_pixel;
+        const float3 dist_minus = abs(dist - third_pixel);
+        //  Average the three generalized Gaussian samples:
+        const float3 weight_plus = exp(-pow(abs(dist_plus*width_inv), shape));
+        const float3 weight_minus = exp(-pow(abs(dist_minus*width_inv), shape));
+        return scale/3.0 * (weight_center + weight_plus + weight_minus);
+    }
+
+    //  No antialiasing: a single sample at dist.
+    return scale * weight_center;
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    A scanline's light output over a given pixel, dispatching
+    //              on beam_generalized_gaussian (generalized vs. pure Gaussian)
+    //              and beam_antialias_level (integral vs. sampled).
+    //  Antialias levels above 1.5 integrate over the pixel; lower ones sample.
+    const bool use_integrals = (beam_antialias_level > 1.5);
+    if(beam_generalized_gaussian)
+    {
+        //  Generalized Gaussian beam (needs shape_range as well):
+        if(use_integrals)
+        {
+            //  Integrated generalized Gaussian:
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        //  Sampled generalized Gaussian:
+        return scanline_generalized_gaussian_sampled_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+
+    //  Pure Gaussian beam (shape_range is unused on this path):
+    if(use_integrals)
+    {
+        //  Integrated pure Gaussian:
+        return scanline_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range);
+    }
+    //  Sampled pure Gaussian:
+    return scanline_gaussian_sampled_contrib(
+        dist, color, pixel_height, sigma_range);
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{   //  Blend the four colors by weights; clamp at 0 to avoid negative-color artifacts.
+    const float3 blended = mul(weights, float4x3(color0, color1, color2, color3));
+    return max(blended, 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  The weight is static or dynamic branches are allowed, so branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are gamma-encoded RGB.  pow() needs a float3 exponent.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+            //  FIXME (left by the port): a hard-coded local weight of 1.0 was
+            //  tried here and disabled; the global weight is what gets used.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (color1)
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  the four taps in left-to-right order, where the second
+    //                  tap is the texel just left of the output pixel.
+    //  Returns:    A horizontally interpolated texture lookup using 2-4
+    //              nearby texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  Quilez resampling ignores the outer taps, so only fetch them otherwise:
+    float3 outer_left = float3(0.0);
+    float3 outer_right = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        outer_left = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        outer_right = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    const float3 inner_left = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 inner_right = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    //  Samples are passed on as-is, whether linear or gamma-encoded:
+    return get_interpolated_linear_color(
+        outer_left, inner_left, inner_right, outer_right, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Requires:   tex_uv * tex_size must give texel coordinates, and texel
+    //              coordinates * texture_size_inv must give uv coordinates.
+    //  Snap to the previous texel and get sample dists from 2/4 nearby texels:
+    const float2 texel_pos = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 snapped_texel =
+        floor(texel_pos - float2(under_half)) + float2(0.5);
+    //  Only x is snapped; y keeps the caller's position:
+    const float2 left_texel = float2(snapped_texel.x, texel_pos.y);
+    const float2 left_texel_uv = left_texel * texture_size_inv;
+    const float frac_x = texel_pos.x - left_texel.x;
+    const float4 tap_dists = float4(1.0 + frac_x, frac_x,
+        1.0 - frac_x, 2.0 - frac_x);
+    //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
+    float4 weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez (two inner taps only):
+        const float w2 = frac_x*frac_x*frac_x*(frac_x*(frac_x*6.0 - 15.0) + 10.0);
+        weights = float4(0.0, 1.0 - w2, w2, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian with standard deviation beam_horiz_sigma:
+        float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        weights = exp(-(tap_dists*tap_dists)*inner_denom_inv);
+    }
+    else
+    {
+        //  Lanczos2 (FIX_ZERO is applied to the pi-scaled distances first):
+        const float4 scaled_dists = FIX_ZERO(tap_dists * pi);
+        weights = 2.0 * sin(scaled_dists) * sin(scaled_dists * 0.5) /
+            (scaled_dists * scaled_dists);
+    }
+    //  Normalize so the weights sum to 1.0:
+    const float4 unit_weights = weights/dot(weights, float4(1.0));
+    //  Interpolate along the scanline, stepping one texel in u per tap:
+    const float2 uv_step_x = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(tex, left_texel_uv, uv_step_x, unit_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Requires:   Arguments are forwarded unchanged to
+    //              sample_single_scanline_horizontal(), so its requirements
+    //              apply here too.
+    //  Returns:    The horizontally filtered scanline color.  When
+    //              beam_misconvergence is set, R, G and B are each sampled
+    //              at their own horizontal offset.
+    if(beam_misconvergence)
+    {
+        //  Per-channel horizontal offsets, scaled by texture_size_inv.x:
+        const float3 offsets_u =
+            get_convergence_offsets_x_vector() * texture_size_inv.xxx;
+        //  One full lookup per channel, with its offset subtracted from u:
+        const float3 from_r = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(offsets_u.r, 0.0), tex_size, texture_size_inv);
+        const float3 from_g = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(offsets_u.g, 0.0), tex_size, texture_size_inv);
+        const float3 from_b = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(offsets_u.b, 0.0), tex_size, texture_size_inv);
+        //  Keep only the matching channel of each lookup:
+        return float3(from_r.r, from_g.g, from_b.b);
+    }
+
+    //  No misconvergence: a single lookup serves all three channels.
+    return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+        texture_size_inv);
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only even/odd scanlines are considered
+    //  every other frame.  Top-field first (TFF) order puts even scanlines on
+    //  even frames, and BFF order puts them on odd frames.  Texels are centered
+    //  at:     frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    //  FIXME (left by the port): a hard-coded local field order of 1.0 was
+    //  tried here and disabled; the global interlace_bff is what gets used.
+    const float field_shift = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 texel_pos = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_line_num = floor(texel_pos - float2(under_half));
+    const float lines_past_field = fmod(
+        prev_line_num.y + field_shift, il_step_multiple.y);
+    const float2 field_line_num = prev_line_num - float2(0.0, lines_past_field);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 line_center = field_line_num + float2(0.5);
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (texel_pos.y - line_center.y)/il_step_multiple.y;
+    //  Convert the snapped texel position back to uv:
+    return line_center * texture_size_inv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Detect interlacing based on the number of lines in the source.
+    //  Always false when interlace_detect is off.
+    bool interlaced = false;
+    if(interlace_detect)
+    {
+        //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+        //  NTSC Emulators: Typically 224 or 240 lines
+        //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+        //  PAL Emulators: ?
+        //  ATSC: 720p, 1080i, 1080p
+        //  Where do we place our cutoffs?  Assumptions:
+        //  1.) We only need to care about active lines.
+        //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+        //  3.) Anything > 576 lines is probably not interlaced...
+        //  4.) ...except 1080 lines, which is a crapshoot (user decision).
+        //  5.) Just in case the main program uses calculated video sizes,
+        //      we should nudge the float thresholds a bit.
+        const bool sd_range = ((num_lines > 288.5) && (num_lines < 576.5));
+        //  1080-line sources only count when interlace_1080i opts in:
+        const bool hd_range = bool(interlace_1080i) &&
+            ((num_lines > 1079.5) && (num_lines < 1080.5));
+
+        interlaced = (sd_range || hd_range);
+    }
+    return interlaced;
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+///////////////////////////////  END VERTEX INCLUDES  /////////////////////////////
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+//#include "../../../../include/blur-functions.h"
+
+////////////////////////////  BEGIN BLUR-FUNCTIONS  ///////////////////////////
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  output_size < video_size.
+//              4.) output_size == video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (video_size/output_size)/texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(video_size/output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static const float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static const float blur3_std_dev
+//                      static const float blur4_std_dev
+//                      static const float blur5_std_dev
+//                      static const float blur6_std_dev
+//                      static const float blur7_std_dev
+//                      static const float blur8_std_dev
+//                      static const float blur9_std_dev
+//                      static const float blur10_std_dev
+//                      static const float blur11_std_dev
+//                      static const float blur12_std_dev
+//                      static const float blur17_std_dev
+//                      static const float blur25_std_dev
+//                      static const float blur31_std_dev
+//                      static const float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static const float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Default blurN_std_dev values for the wrapper blurs.  #define
+//  OVERRIDE_BLUR_STD_DEVS to supply your own (uniforms work, at a speed cost):
+#ifndef OVERRIDE_BLUR_STD_DEVS
+    //  Units: every blurN_std_dev below is expressed in dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  Opt-in alternative: standard deviations matching a binomial
+        //  distribution with p = 0.5 (related to Pascal's triangle), chosen
+        //  so that blurring multiple times should give the same result as a
+        //  single larger blur.  Compared with the defaults below, these are
+        //  larger for blurs up to 6x and smaller from 7x onward.
+        static const float blur3_std_dev = 0.84931640625;
+        static const float blur4_std_dev = 0.84931640625;
+        static const float blur5_std_dev = 1.0595703125;
+        static const float blur6_std_dev = 1.06591796875;
+        static const float blur7_std_dev = 1.17041015625;
+        static const float blur8_std_dev = 1.1720703125;
+        static const float blur9_std_dev = 1.2259765625;
+        static const float blur10_std_dev = 1.21982421875;
+        static const float blur11_std_dev = 1.25361328125;
+        static const float blur12_std_dev = 1.2423828125;
+        static const float blur17_std_dev = 1.27783203125;
+        static const float blur25_std_dev = 1.2810546875;
+        static const float blur31_std_dev = 1.28125;
+        static const float blur43_std_dev = 1.28125;
+    #else
+        //  Defaults: the largest values that keep the largest unused blur
+        //  term on each side <= 1.0/256.0.  (We could get away with more or
+        //  be more conservative, but this compromise is pretty reasonable.)
+        static const float blur3_std_dev = 0.62666015625;
+        static const float blur4_std_dev = 0.66171875;
+        static const float blur5_std_dev = 0.9845703125;
+        static const float blur6_std_dev = 1.02626953125;
+        static const float blur7_std_dev = 1.36103515625;
+        static const float blur8_std_dev = 1.4080078125;
+        static const float blur9_std_dev = 1.7533203125;
+        static const float blur10_std_dev = 1.80478515625;
+        static const float blur11_std_dev = 2.15986328125;
+        static const float blur12_std_dev = 2.215234375;
+        static const float blur17_std_dev = 3.45535583496;
+        static const float blur25_std_dev = 5.3409576416;
+        static const float blur31_std_dev = 6.86488037109;
+        static const float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  Default tuning value for shared-sample blurs; keep it in [0.0, 1.0].
+    //  Higher values reduce ringing but add blurring and feature shifting.
+    static const float error_blurring = 0.5;
+#endif  //  OVERRIDE_ERROR_BLURRING
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Standard gammas; #define OVERRIDE_STANDARD_GAMMA to supply all 7 yourself:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default get_crt_gamma()
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  When true, encode_output() and the decode_*() functions return alpha = 1.0
+//  rather than color.a.  This may help users avoid bugs, if they know of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device decoding gammas, wrapped in functions so overrides may be uniforms:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+    inline float get_gba_gamma() { return 3.5; }    //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+#else
+    //  The user promises to have globally defined crt/gba/lcd_gamma already:
+    inline float get_lcd_gamma() { return lcd_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_crt_gamma() { return crt_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  Overridden: forward the constants the user promised to define globally.
+    inline float get_input_gamma()          { return input_gamma; }
+    inline float get_output_gamma()         { return output_gamma; }
+    inline float get_intermediate_gamma()   { return intermediate_gamma; }
+#else
+    //  Between passes, always encode with ntsc_gamma when gamma-correcting
+    //  every FBO, so middle passes never care whether a device is simulated:
+    inline float get_intermediate_gamma()   { return ntsc_gamma; }
+    //  The first SIMULATE_* macro #defined (in this order) picks input/output:
+    #if defined(SIMULATE_CRT_ON_LCD)
+        //  Decode with the CRT gamma, encode with the LCD gamma:
+        inline float get_input_gamma()      { return get_crt_gamma(); }
+        inline float get_output_gamma()     { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        //  Decode with the GBA gamma, encode with the LCD gamma:
+        inline float get_input_gamma()      { return get_gba_gamma(); }
+        inline float get_output_gamma()     { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        //  Decode with the LCD gamma, encode with the CRT gamma:
+        inline float get_input_gamma()      { return get_lcd_gamma(); }
+        inline float get_output_gamma()     { return get_crt_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        //  Decode with the GBA gamma, encode with the CRT gamma:
+        inline float get_input_gamma()      { return get_gba_gamma(); }
+        inline float get_output_gamma()     { return get_crt_gamma(); }
+    #else
+        //  Don't simulate anything: plain ntsc_gamma in and out.
+        inline float get_input_gamma()      { return ntsc_gamma; }
+        inline float get_output_gamma()     { return ntsc_gamma; }
+    #endif  //  SIMULATE_*
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Per-pass decoding/encoding setup.  linearize_input and gamma_encode_output
+//  stay static constants (they aren't derived from overridable values), which
+//  lets the compiler eliminate the dead branches that test them.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    static const bool gamma_encode_output = true;
+    static const bool linearize_input = true;
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #endif  //  LAST_PASS
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #endif  //  FIRST_PASS
+#else
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+        static const bool gamma_encode_output = true;
+    #else
+        inline float get_pass_output_gamma() { return 1.0; }
+        static const bool gamma_encode_output = false;
+    #endif  //  LAST_PASS
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+        static const bool linearize_input = true;
+    #else
+        inline float get_pass_input_gamma() { return 1.0; }
+        static const bool linearize_input = false;
+    #endif  //  FIRST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Bilinear filtering is reported gamma-correct exactly when this pass does not linearize its input:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+//  Gamma-encodes a color for this pass's output.
+//  color:   the color to encode
+//  Returns: color untouched when gamma_encode_output is false; otherwise rgb
+//           raised to 1.0/get_pass_output_gamma(), with alpha forced to 1.0
+//           if assume_opaque_alpha is set and passed through if not.
+inline float4 encode_output(const float4 color)
+{
+    //  Pass-through case first, so the encoding path below reads straight.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    //  Locals are left non-const: some GLSL versions only accept constant
+    //  expressions as initializers for const variables.
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, alpha);
+}
+
+//  Linearizes a color read from this pass's input.
+//  color:   the color to decode
+//  Returns: color untouched when linearize_input is false; otherwise rgb
+//           raised to get_pass_input_gamma(), with alpha forced to 1.0 if
+//           assume_opaque_alpha is set and passed through if not.
+inline float4 decode_input(const float4 color)
+{
+    //  Pass-through case first, so the decoding path below reads straight.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    //  Locals are left non-const: some GLSL versions only accept constant
+    //  expressions as initializers for const variables.
+    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, alpha);
+}
+
+//  Linearizes a color using a caller-supplied decoding gamma.
+//  color:   the encoded color to decode
+//  gamma:   per-channel exponent applied to color.rgb
+//  Returns: pow(color.rgb, gamma) with alpha forced to 1.0 if
+//           assume_opaque_alpha is set and passed through if not.  This runs
+//           unconditionally; linearize_input is not consulted here.
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(pow(color.rgb, gamma), alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D:
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{
+    //  Sample tex at tex_coords with COMPAT_TEXTURE, then return the result
+    //  of running that sample through decode_input() (defined elsewhere).
+    float4 raw_sample = COMPAT_TEXTURE(tex, tex_coords);
+    return decode_input(raw_sample);
+}
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{
+    //  Float3 version: only tex_coords.xy is used for the lookup; the .z
+    //  component is ignored.
+    float4 raw_sample = COMPAT_TEXTURE(tex, tex_coords.xy);
+    return decode_input(raw_sample);
+}
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{
+    //  Sample tex at tex_coords via textureLod(), forwarding texel_off as its
+    //  third argument, then run the sample through decode_input().
+    //  NOTE(review): textureLod's third argument is a level of detail, not a
+    //  texel offset, so texel_off acts as a mip level here -- confirm intent.
+    float4 raw_sample = textureLod(tex, tex_coords, texel_off);
+    return decode_input(raw_sample);
+}
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{
+    //  Float3 version: only tex_coords.xy is used for the lookup.  texel_off
+    //  is forwarded as textureLod's third argument.
+    //  NOTE(review): textureLod's third argument is a level of detail, not a
+    //  texel offset, so texel_off acts as a mip level here -- confirm intent.
+    float4 raw_sample = textureLod(tex, tex_coords.xy, texel_off);
+    return decode_input(raw_sample);
+}
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{
+    //  Sample tex at tex_coords.xy with an explicit LOD of 0.0, then run the
+    //  sample through decode_input().  tex_coords.zw is not read, so any LOD
+    //  stored in .w is ignored.
+    float4 raw_sample = textureLod(tex, tex_coords.xy, 0.0);
+    return decode_input(raw_sample);
+}
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{
+    //  Sample tex at tex_coords.xy via textureLod(), forwarding texel_off as
+    //  its third argument, then run the sample through decode_input().
+    //  tex_coords.zw is not read.
+    //  NOTE(review): textureLod's third argument is a level of detail, not a
+    //  texel offset, so texel_off acts as a mip level here -- confirm intent.
+    float4 raw_sample = textureLod(tex, tex_coords.xy, texel_off);
+    return decode_input(raw_sample);
+}
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
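+/////////*
+//  NOTE: The slash/star marker above does not end the block comment opened
+//  before the tex3D wrappers, so everything from here down to the next
+//  comment terminator (including the tex2D_linearize_gamma overloads) is
+//  still commented out.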
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{
+    //  Sample tex at tex_coords.xy with an explicit LOD of 0.0, then pass the
+    //  sample and the caller's gamma to decode_gamma_input() (defined
+    //  elsewhere).  tex_coords.zw is not read.
+    float4 raw_sample = textureLod(tex, tex_coords.xy, 0.0);
+    return decode_gamma_input(raw_sample, gamma);
+}
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{
+    //  Sample tex at tex_coords.xy via textureLod(), forwarding texel_off as
+    //  its third argument, then pass the sample and the caller's gamma to
+    //  decode_gamma_input().  tex_coords.zw is not read.
+    //  NOTE(review): textureLod's third argument is a level of detail, not a
+    //  texel offset, so texel_off acts as a mip level here -- confirm intent.
+    float4 raw_sample = textureLod(tex, tex_coords.xy, texel_off);
+    return decode_gamma_input(raw_sample, gamma);
+}
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "quad-pixel-communication.h"
+
+///////////////////////  BEGIN QUAD-PIXEL-COMMUNICATION  //////////////////////
+
+#ifndef QUAD_PIXEL_COMMUNICATION_H
+#define QUAD_PIXEL_COMMUNICATION_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey*
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DISCLAIMER  /////////////////////////////////
+
+//  *This code was inspired by "Shader Amortization using Pixel Quad Message
+//  Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2.  My intent
+//  is not to plagiarize his fundamentally similar code and assert my own
+//  copyright, but the algorithmic helper functions require so little code that
+//  implementations can't vary by much except bugfixes and conventions.  I just
+//  wanted to license my own particular code here to avoid ambiguity and make it
+//  clear that as far as I'm concerned, people can do as they please with it.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  Given screen pixel numbers, derive a "quad vector" describing a fragment's
+//  position in its 2x2 pixel quad.  Given that vector, obtain the values of any
+//  variable at neighboring fragments.
+//  Requires:   Using this file in general requires:
+//              1.) ddx() and ddy() are present in the current Cg profile.
+//              2.) The GPU driver is using fine/high-quality derivatives.
+//                  Functions will give incorrect results if this is not true,
+//                  so a test function is included.
+
+
+/////////////////////  QUAD-PIXEL COMMUNICATION PRIMITIVES  ////////////////////
+
+float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Input:      The fragment's output pixel number, measured two ways, each
+    //              within ([0, output_size.x), [0, output_size.y)):
+    //                  .xy grows along the uv coordinate directions.
+    //                  .zw grows along the screen xy directions.
+    //  Output:     The fragment's slot within its 2x2 quad, measured two ways:
+    //                  .xy is relative to the uv direction, with the origin
+    //                  (0, 0) at the top-left:
+    //                      top-left    = (-1.0, -1.0)  top-right    = (1.0, -1.0)
+    //                      bottom-left = (-1.0,  1.0)  bottom-right = (1.0,  1.0)
+    //                  Use it to arrange/weight shared texture samples.
+    //                  .zw is relative to the screen xy direction (position);
+    //                  its origin varies.  quad_gather depends on this one.
+    //              The two measures are related by:
+    //                  quad_vector.zw = quad_vector.xy * float2(
+    //                      ddx(output_pixel_num_wrt_uvxy.x),
+    //                      ddy(output_pixel_num_wrt_uvxy.y));
+    //  Limitation: 2x2 quads are assumed to begin on even pixel numbers.  The
+    //              driver may break that assumption (nondeterministically) at
+    //              odd output resolutions.
+    //  For whole-number inputs each component of parity is 0.0 on even pixel
+    //  numbers and 1.0 on odd ones.
+    float4 parity = 2.0 * frac(0.5 * output_pixel_num_wrt_uvxy);
+    //  Remap {0.0, 1.0} onto {-1.0, 1.0}.
+    return 2.0 * parity - float4(1.0);
+}
+
+float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Input:      Identical to get_quad_vector_naive() (read that first).
+    //  Output:     Identical to get_quad_vector_naive(), except the result
+    //              stays correct when the 2x2 pixel quad begins on an odd
+    //              pixel, as can happen at odd resolutions.
+    float4 naive_guess = get_quad_vector_naive(output_pixel_num_wrt_uvxy);
+    //  The screen-space guess (.zw) should increase with screen xy.  Where it
+    //  does not, the quad begins on an odd pixel; half the screen-space
+    //  derivative of the guess is the per-axis factor that corrects it.
+    float mirror_x = 0.5 * ddx(naive_guess.z);
+    float mirror_y = 0.5 * ddy(naive_guess.w);
+    return naive_guess * float4(mirror_x, mirror_y, mirror_x, mirror_y);
+}
+
+float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
+{
+    //  Input:      1.) The current Cg profile must provide ddx() and ddy().
+    //              2.) output_pixel_num_wrt_uv grows with uv coords and gives
+    //                  the current fragment's output pixel number within:
+    //                      ([0, output_size.x), [0, output_size.y))
+    //  Output:     Identical to get_quad_vector_naive() (read that first),
+    //              except the result stays correct when the 2x2 pixel quad
+    //              begins on an odd pixel, as can happen at odd resolutions.
+    //  Tradeoff:   Needs less information than the float4 overload, but may
+    //              be slower.
+    //  Direction of screen xy relative to (uv.x, uv.y), per axis, in {-1, 1}:
+    float2 uv_to_screen_sign = float2(ddx(output_pixel_num_wrt_uv.x),
+                                      ddy(output_pixel_num_wrt_uv.y));
+    //  Naive quad position with respect to uv, then with respect to screen xy:
+    float2 parity_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
+    float2 guess_uv = (parity_uv - float2(0.5)) * 2.0;
+    float2 guess_screen = guess_uv * uv_to_screen_sign;
+    //  A screen-space guess that fails to increase with screen xy means the
+    //  quad begins on an odd pixel; this per-axis factor corrects for it.
+    float2 odd_start_fix = 0.5 * float2(ddx(guess_screen.x),
+                                        ddy(guess_screen.y));
+    return float4(guess_uv * odd_start_fix, guess_screen * odd_start_fix);
+}
+
+void quad_gather(float4 quad_vector, float4 curr,
+    out float4 adjx, out float4 adjy, out float4 diag)
+{
+    //  Input:      1.) The current Cg profile must provide ddx() and ddy().
+    //              2.) The GPU driver must use fine/high-quality derivatives.
+    //              3.) quad_vector gives this fragment's place in its 2x2
+    //                  pixel quad, following get_quad_vector()'s conventions.
+    //              4.) curr is whatever vector you want neighboring values of.
+    //  Output:     Through the out parameters, the value curr takes at the
+    //              x-adjacent, y-adjacent, and diagonal fragments.
+    float screen_dir_x = quad_vector.z;
+    float screen_dir_y = quad_vector.w;
+    adjx = curr - screen_dir_x * ddx(curr);
+    adjy = curr - screen_dir_y * ddy(curr);
+    //  Reach the diagonal by stepping in y from the x-adjacent value.
+    diag = adjx - screen_dir_y * ddy(adjx);
+}
+
+void quad_gather(float4 quad_vector, float3 curr,
+    out float3 adjx, out float3 adjy, out float3 diag)
+{
+    //  Float3 overload: writes curr's value at the x-adjacent, y-adjacent,
+    //  and diagonal fragments of the quad to the out parameters.
+    float screen_dir_x = quad_vector.z;
+    float screen_dir_y = quad_vector.w;
+    adjx = curr - screen_dir_x * ddx(curr);
+    adjy = curr - screen_dir_y * ddy(curr);
+    //  Reach the diagonal by stepping in y from the x-adjacent value.
+    diag = adjx - screen_dir_y * ddy(adjx);
+}
+
+void quad_gather(float4 quad_vector, float2 curr,
+    out float2 adjx, out float2 adjy, out float2 diag)
+{
+    //  Float2 overload: writes curr's value at the x-adjacent, y-adjacent,
+    //  and diagonal fragments of the quad to the out parameters.
+    float screen_dir_x = quad_vector.z;
+    float screen_dir_y = quad_vector.w;
+    adjx = curr - screen_dir_x * ddx(curr);
+    adjy = curr - screen_dir_y * ddy(curr);
+    //  Reach the diagonal by stepping in y from the x-adjacent value.
+    diag = adjx - screen_dir_y * ddy(adjx);
+}
+
+float4 quad_gather(float4 quad_vector, float curr)
+{
+    //  Scalar overload.  The four quad values are packed into one float4:
+    //      result.x = this fragment's value (curr)
+    //      result.y = value at the x-adjacent fragment
+    //      result.z = value at the y-adjacent fragment
+    //      result.w = value at the diagonal fragment
+    float adjacent_x = curr - ddx(curr) * quad_vector.z;
+    //  Step both values of this row in y at once to get the other row.
+    float2 this_row = float2(curr, adjacent_x);
+    float2 other_row = this_row - ddy(this_row) * quad_vector.w;
+    return float4(this_row, other_row);
+}
+
+float4 quad_gather_sum(float4 quad_vector, float4 curr)
+{
+    //  Input:      Same requirements as quad_gather().
+    //  Output:     curr summed over the four fragments of the 2x2 quad.
+    float4 neighbor_x, neighbor_y, neighbor_diag;
+    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_diag);
+    float4 total = curr + neighbor_x;
+    total += neighbor_y;
+    total += neighbor_diag;
+    return total;
+}
+
+float3 quad_gather_sum(float4 quad_vector, float3 curr)
+{
+    //  Float3 overload: curr summed over the four fragments of the 2x2 quad.
+    float3 neighbor_x, neighbor_y, neighbor_diag;
+    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_diag);
+    float3 total = curr + neighbor_x;
+    total += neighbor_y;
+    total += neighbor_diag;
+    return total;
+}
+
+float2 quad_gather_sum(float4 quad_vector, float2 curr)
+{
+    //  Float2 overload: curr summed over the four fragments of the 2x2 quad.
+    float2 neighbor_x, neighbor_y, neighbor_diag;
+    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_diag);
+    float2 total = curr + neighbor_x;
+    total += neighbor_y;
+    total += neighbor_diag;
+    return total;
+}
+
+float quad_gather_sum(float4 quad_vector, float curr)
+{
+    //  Scalar overload: the scalar quad_gather() packs all four quad values
+    //  into one float4; add its components together in x, y, z, w order.
+    float4 quad_values = quad_gather(quad_vector, curr);
+    float total = quad_values.x + quad_values.y;
+    total += quad_values.z;
+    total += quad_values.w;
+    return total;
+}
+
+bool fine_derivatives_working(float4 quad_vector, float4 curr)
+{
+    //  Input:      1.) The current Cg profile must provide ddx() and ddy().
+    //              2.) quad_vector gives this fragment's place in its 2x2
+    //                  pixel quad, following get_quad_vector()'s conventions.
+    //              3.) curr is a test vector whose derivatives are not
+    //                  constant (it must vary nonlinearly across fragments).
+    //  Output:     true when fine/hybrid/high-quality derivatives are in use;
+    //              false when derivatives are coarse or the test cannot tell.
+    //  Purpose:    Check whether quad-pixel communication actually works.
+    //  Approach:   Fine derivatives are confirmed if, for any value at any
+    //              fragment, either of these holds:
+    //                  (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
+    //              Testing more values (e.g. a float4 both ways) makes a
+    //              positive result easier to demonstrate.
+    //  TODO: Check for floating point exact comparison issues!
+    float4 dx_here = ddx(curr);
+    float4 dy_here = ddy(curr);
+    //  Values of curr at the x-adjacent and y-adjacent fragments.
+    float4 neighbor_x = curr - dx_here * quad_vector.z;
+    float4 neighbor_y = curr - dy_here * quad_vector.w;
+    //  Derivatives of those neighbor values, taken once and reused below.
+    float4 dy_at_neighbor_x = ddy(neighbor_x);
+    float4 dx_at_neighbor_y = ddx(neighbor_y);
+    bool dy_mismatch = any(bool4(
+        dy_here.x != dy_at_neighbor_x.x,
+        dy_here.y != dy_at_neighbor_x.y,
+        dy_here.z != dy_at_neighbor_x.z,
+        dy_here.w != dy_at_neighbor_x.w));
+    bool dx_mismatch = any(bool4(
+        dx_here.x != dx_at_neighbor_y.x,
+        dx_here.y != dx_at_neighbor_y.y,
+        dx_here.z != dx_at_neighbor_y.z,
+        dx_here.w != dx_at_neighbor_y.w));
+    return (dy_mismatch || dx_mismatch);
+}
+
+bool fine_derivatives_working_fast(float4 quad_vector, float curr)
+{
+    //  Input:      Same requirements as fine_derivatives_working().
+    //  Output:     Same meaning as fine_derivatives_working().
+    //  Notes:      Cheaper than fine_derivatives_working(), but it produces
+    //              false negatives more readily, which makes it less suited
+    //              to offline testing/debugging.  As of May 2014 it also
+    //              cannot drive dynamic runtime branching, because derivatives
+    //              (and quad-pixel communication) are not allowed inside
+    //              branches.  Future GPU's might permit them in dynamic
+    //              branches if the branch condition is promised to be the same
+    //              for every fragment of the quad (and/or the driver enforces
+    //              that by letting one fragment decide).  Should that happen,
+    //              this version may become the more economical choice.
+    //  Value of curr at the x-adjacent fragment:
+    float neighbor_x = curr - ddx(curr) * quad_vector.z;
+    //  The y derivative differs between the two columns of the quad only if
+    //  derivatives are fine.
+    float dy_here = ddy(curr);
+    float dy_at_neighbor_x = ddy(neighbor_x);
+    return (dy_here != dy_at_neighbor_x);
+}
+
+#endif  //  QUAD_PIXEL_COMMUNICATION_H
+
+////////////////////////  END QUAD-PIXEL-COMMUNICATION  ///////////////////////
+
+//#include "special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Input:      x, the usual argument of erf().
+    //  Output:     An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              Its maximum absolute error is 2.5*10**-5, and it is
+    //              numerically robust and efficient.  Reference:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    //  The approximation is built from |x|; the sign is reapplied at the end.
+    float4 t = float4(1.0)/(float4(1.0) + 0.47047*abs(x));
+    //  Cubic in t (no constant term), evaluated in nested form.
+    float4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float4 magnitude = float4(1.0) - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload of the Abramowitz/Stegun erf() approximation.
+    //  The approximation is built from |x|; the sign is reapplied at the end.
+    float3 t = float3(1.0)/(float3(1.0) + 0.47047*abs(x));
+    //  Cubic in t (no constant term), evaluated in nested form.
+    float3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float3 magnitude = float3(1.0) - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload of the Abramowitz/Stegun erf() approximation.
+    //  The approximation is built from |x|; the sign is reapplied at the end.
+    float2 t = float2(1.0)/(float2(1.0) + 0.47047*abs(x));
+    //  Cubic in t (no constant term), evaluated in nested form.
+    float2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float2 magnitude = float2(1.0) - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload of the Abramowitz/Stegun erf() approximation.
+    //  The approximation is built from |x|; the sign is reapplied at the end.
+    float t = 1.0/(1.0 + 0.47047*abs(x));
+    //  Cubic in t (no constant term), evaluated in nested form.
+    float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float magnitude = 1.0 - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Input:      x, the usual argument of erf().
+    //  Output:     erf() approximated by a hyperbolic tangent.  The error can
+    //              be seen, but the function is very fast and perceptually
+    //              close...at least on ATI hardware.  Reference:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Use only where the hardware drivers implement tanh()
+    //              correctly: the author's nVidia 8800GTS returned garbage.
+    float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 overload of the tanh-based erf() approximation.
+    float3 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 overload of the tanh-based erf() approximation.
+    float2 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    //  Scalar overload of the tanh-based erf() approximation.
+    float scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Input:      x, the usual argument of erf().
+    //  Output:     An approximation of erf(x) chosen at compile time: erft()
+    //              when ERF_FAST_APPROXIMATION is defined, otherwise erf6().
+#ifdef ERF_FAST_APPROXIMATION
+    float4 approximation = erft(x);
+#else
+    float4 approximation = erf6(x);
+#endif
+    return approximation;
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 overload: erft() when ERF_FAST_APPROXIMATION is defined,
+    //  otherwise erf6().
+#ifdef ERF_FAST_APPROXIMATION
+    float3 approximation = erft(x);
+#else
+    float3 approximation = erf6(x);
+#endif
+    return approximation;
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 overload: erft() when ERF_FAST_APPROXIMATION is defined,
+    //  otherwise erf6().
+#ifdef ERF_FAST_APPROXIMATION
+    float2 approximation = erft(x);
+#else
+    float2 approximation = erf6(x);
+#endif
+    return approximation;
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload: erft() when ERF_FAST_APPROXIMATION is defined,
+    //  otherwise erf6().
+#ifdef ERF_FAST_APPROXIMATION
+    float approximation = erft(x);
+#else
+    float approximation = erf6(x);
+#endif
+    return approximation;
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Input:      1.) s, the usual argument of the gamma function; it is
+    //                  expected to lie within [0, 36].
+    //              2.) s_inv = 1.0/s.  The caller precomputes it so the value
+    //                  can be reused outside this function.
+    //  Output:     An approximation of the gamma function (real-numbered
+    //              factorial) from a two-coefficient Lanczos approximation,
+    //              with coefficients derived by Paul Godfrey's method:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              For s in [0, 36] a near-optimal g is ~1.12906830989, giving
+    //              a maximum relative error of 0.000463 over 2**16 equally
+    //              spaced evals.  Three coefficients (0.0000346 error) would
+    //              not hurt latency, but two leave more room for parallelism
+    //              with outside instructions.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler_e = float4(2.71828182845904523536028747135266249775724709);
+    float4 s_plus_half = s + float4(0.5);
+    float4 series = coeff0 + coeff1/(s + float4(1.0));
+    //  Equivalent to (s + g + float4(0.5))/e.
+    float4 power_base = (s_plus_half + lanczos_g)/euler_e;
+    //  power_base**s_plus_half * series approximates gamma(s + 1); scaling by
+    //  s_inv turns it into gamma(s).  For small s this has less error than
+    //  starting with (s -= 1.0).
+    float4 gamma_s_plus_one = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 overload of the two-coefficient Lanczos gamma approximation.
+    //  s_inv must be 1.0/s, precomputed by the caller.
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler_e = float3(2.71828182845904523536028747135266249775724709);
+    float3 s_plus_half = s + float3(0.5);
+    float3 series = coeff0 + coeff1/(s + float3(1.0));
+    float3 power_base = (s_plus_half + lanczos_g)/euler_e;
+    //  This product approximates gamma(s + 1); s_inv turns it into gamma(s).
+    float3 gamma_s_plus_one = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 overload of the two-coefficient Lanczos gamma approximation.
+    //  s_inv must be 1.0/s, precomputed by the caller.
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler_e = float2(2.71828182845904523536028747135266249775724709);
+    float2 s_plus_half = s + float2(0.5);
+    float2 series = coeff0 + coeff1/(s + float2(1.0));
+    float2 power_base = (s_plus_half + lanczos_g)/euler_e;
+    //  This product approximates gamma(s + 1); s_inv turns it into gamma(s).
+    float2 gamma_s_plus_one = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload of the two-coefficient Lanczos gamma approximation.
+    //  s_inv must be 1.0/s, precomputed by the caller.
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler_e = 2.71828182845904523536028747135266249775724709;
+    float s_plus_half = s + 0.5;
+    float series = coeff0 + coeff1/(s + 1.0);
+    float power_base = (s_plus_half + lanczos_g)/euler_e;
+    //  This product approximates gamma(s + 1); s_inv turns it into gamma(s).
+    float gamma_s_plus_one = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Approximates the gamma function per component via gamma_impl().
+    //  Requires:   s in the [0, 36] range (s == 0 divides by zero below).
+    //  Returns:    gamma(s); the stated max relative error is 0.000463.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    const float3 s_inv = float3(1.0)/s;  //  Float3 overload; s == 0 divides by zero
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    const float2 s_inv = float2(1.0)/s;  //  Float2 overload; s == 0 divides by zero
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    const float s_inv = 1.0/s;  //  Scalar overload; s == 0.0 divides by zero
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    The first four terms of the power series for the lower
+    //              incomplete gamma function, scaled by pow(z, s).
+    //  Written as a loop, the summation is:
+    //      sign = 1.0; z_pow = 1.0; factorial = 1.0;
+    //      sum = sign * z_pow / ((s + 0.0) * factorial);
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          sign *= -1.0; z_pow *= z; factorial *= k;
+    //          sum += sign * z_pow / ((s + k) * factorial);
+    //      }
+    //  Below, the loop is unrolled with each factorial folded into its divisor.
+    const float4 z_to_s = pow(z, s);
+    const float4 z_squared = z*z;
+    //  Divisors k! * (s + k) for k = 1, 2, and 3:
+    const float4 div1 = s + float4(1.0);
+    const float4 div2 = 2.0*s + float4(4.0);
+    const float4 div3 = 6.0*s + float4(18.0);
+    //  A fifth term would add z_squared*z_squared/(24.0*s + float4(96.0)).
+    float4 series = s_inv;  //  k = 0 term
+    //  k = 1, 2, and 3 terms:
+    series -= z/div1;
+    series += z_squared/div2;
+    series -= z * z_squared/div3;
+    //  Apply the pow(z, s) factor and return:
+    return z_to_s * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 overload: pow(z, s) times a four-term alternating series in z.
+    const float3 z_to_s = pow(z, s);
+    const float3 z_squared = z*z;
+    const float3 div1 = s + float3(1.0);
+    const float3 div2 = 2.0*s + float3(4.0);
+    const float3 div3 = 6.0*s + float3(18.0);
+    float3 series = s_inv;
+    series -= z/div1;
+    series += z_squared/div2;
+    series -= z * z_squared/div3;
+    return z_to_s * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 overload: pow(z, s) times a four-term alternating series in z.
+    const float2 z_to_s = pow(z, s);
+    const float2 z_squared = z*z;
+    const float2 div1 = s + float2(1.0);
+    const float2 div2 = 2.0*s + float2(4.0);
+    const float2 div3 = 6.0*s + float2(18.0);
+    float2 series = s_inv;
+    series -= z/div1;
+    series += z_squared/div2;
+    series -= z * z_squared/div3;
+    return z_to_s * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload: pow(z, s) times a four-term alternating series in z.
+    const float z_to_s = pow(z, s);
+    const float z_squared = z*z;
+    const float div1 = s + 1.0;
+    const float div2 = 2.0*s + 4.0;
+    const float div3 = 6.0*s + 18.0;
+    float series = s_inv;
+    series -= z/div1;
+    series += z_squared/div2;
+    series -= z * z_squared/div3;
+    return z_to_s * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    The upper incomplete gamma function, approximated with
+    //              four levels of Gauss's continued fraction.
+    //  The fraction is truncated and evaluated innermost level first.  As a
+    //  loop, with the level past the last one treated as infinite:
+    //      tail = float4('inf');
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          tail = float4((i * 2.0) - 1.0) + z - s + (i * (s - i))/tail;
+    //      }
+    //      result = pow(z, s) * exp(-z) / tail;
+    //  Unrolled below with the constants folded (level 4 has no tail term):
+    const float4 leading = pow(z, s) * exp(-z);
+    float4 tail = float4(7.0) + z - s;
+    tail = float4(5.0) + z - s + (3.0*s - float4(9.0))/tail;
+    tail = float4(3.0) + z - s + (2.0*s - float4(4.0))/tail;
+    tail = float4(1.0) + z - s + (s - float4(1.0))/tail;
+    return leading / tail;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 overload: pow(z, s)*exp(-z) over a 4-level continued fraction.
+    const float3 leading = pow(z, s) * exp(-z);
+    float3 tail = float3(7.0) + z - s;
+    tail = float3(5.0) + z - s + (3.0*s - float3(9.0))/tail;
+    tail = float3(3.0) + z - s + (2.0*s - float3(4.0))/tail;
+    tail = float3(1.0) + z - s + (s - float3(1.0))/tail;
+    return leading / tail;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 overload: pow(z, s)*exp(-z) over a 4-level continued fraction.
+    const float2 leading = pow(z, s) * exp(-z);
+    float2 tail = float2(7.0) + z - s;
+    tail = float2(5.0) + z - s + (3.0*s - float2(9.0))/tail;
+    tail = float2(3.0) + z - s + (2.0*s - float2(4.0))/tail;
+    tail = float2(1.0) + z - s + (s - float2(1.0))/tail;
+    return leading / tail;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: pow(z, s)*exp(-z) over a 4-level continued fraction.
+    const float leading = pow(z, s) * exp(-z);
+    float tail = 7.0 + z - s;
+    tail = 5.0 + z - s + (3.0*s - 9.0)/tail;
+    tail = 3.0 + z - s + (2.0*s - 4.0)/tail;
+    tail = 1.0 + z - s + (s - 1.0)/tail;
+    return leading / tail;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    The normalized lower incomplete gamma function, approximated
+    //              for s < 0.5.  Restricting s leaves two cases to handle,
+    //              chosen per component by comparing z with a cutoff.  Each
+    //              uses four terms; the stated max relative error is ~0.00182.
+    //              The cutoff and specifics were adapted for fewer terms from
+    //              Gil/Segura/Temme's paper:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both cases are always evaluated and then blended with 0.0/1.0 masks,
+    //  since real branches tested slower even when available.
+    static const float4 cutoff = float4(0.775075);
+    const bool4 pick_upper = bool4(z.x > cutoff.x, z.y > cutoff.y,
+        z.z > cutoff.z, z.w > cutoff.w);
+    const bool4 pick_lower = not(pick_upper);
+    const float4 from_upper =
+        float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float4 from_lower =
+        ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  Blend: each component keeps exactly one of the two results.
+    return from_upper * float4(pick_upper) + from_lower * float4(pick_lower);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  Float3 overload: evaluate both cases, then blend with 0.0/1.0 masks.
+    static const float3 cutoff = float3(0.775075);
+    const bool3 pick_upper = bool3(z.x > cutoff.x, z.y > cutoff.y,
+        z.z > cutoff.z);
+    const bool3 pick_lower = not(pick_upper);
+    const float3 from_upper =
+        float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float3 from_lower =
+        ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_upper * float3(pick_upper) + from_lower * float3(pick_lower);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  Float2 overload: evaluate both cases, then blend with 0.0/1.0 masks.
+    static const float2 cutoff = float2(0.775075);
+    const bool2 pick_upper = bool2(z.x > cutoff.x, z.y > cutoff.y);
+    const bool2 pick_lower = not(pick_upper);
+    const float2 from_upper =
+        float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float2 from_lower =
+        ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_upper * float2(pick_upper) + from_lower * float2(pick_lower);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload: evaluate both cases, then blend with 0.0/1.0 masks.
+    static const float cutoff = 0.775075;
+    const bool pick_upper = z > cutoff;
+    const float from_upper = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_upper * float(pick_upper) + from_lower * float(!pick_upper);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    The approximate normalized lower incomplete gamma function.
+    //              Precomputes 1/s and 1/gamma(s) for normalized_ligamma_impl().
+    const float4 recip_s = float4(1.0)/s;
+    const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  Float3 overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+    const float3 recip_s = float3(1.0)/s;
+    const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  Float2 overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+    const float2 recip_s = float2(1.0)/s;
+    const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+    const float recip_s = 1.0/s;
+    const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 uv2_to_uv4(float2 tex_uv)
+{
+    //  Zero-pad a float2 uv offset so it can be added to float4 tex2Dlod coords:
+    return float4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Squared-length helper macro (for static constants); expands vec twice:
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+inline float get_fast_gaussian_weight_sum_inv(const float sigma)
+{
+    //  Approximates 1.0/(sum of unnormalized Gaussian weights) for large blurs.
+    //  The unnormalized center weight is 1.0, so the normalized center weight
+    //  equals the weight sum inverse, and the Gaussian integral gives its
+    //  asymptotic value (close for blur sizes of 9+):
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //  Since erf(-x) == -erf(x), that reduces to erf(0.5/(sigma*sqrt(2.0))):
+    //      static const float temp = 0.5/sqrt(2.0);
+    //      return erf(temp/sigma);
+    //  The curve fit used below is faster still and also closer.  Per the
+    //  original notes it was built from 64 blur sizes in [3, 131) and 255
+    //  equally-spaced sigmas in (0, blurN_std_dev), biasing results for small
+    //  sigmas toward smaller blurs; the stated max error is 0.0031793913.
+    //  Relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    const float steep_fit = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    const float inverse_fit = 0.399334576340352/sigma;
+    return min(steep_fit, inverse_fit);
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    An 11-tap 1D Gaussian blur of tex around tex_uv, with taps
+    //              spaced dxdy apart.  May be mipmapped per settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-5.  Constant factors
+    //  are omitted because the sum is normalized at the end.
+    const float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_sigma_sq);
+    const float wt2 = exp(-4.0 * inv_two_sigma_sq);
+    const float wt3 = exp(-9.0 * inv_two_sigma_sq);
+    const float wt4 = exp(-16.0 * inv_two_sigma_sq);
+    const float wt5 = exp(-25.0 * inv_two_sigma_sq);
+    const float inv_weight_total = 1.0 /
+        (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4 + wt5));
+    //  Accumulate taps from -5 to +5 and normalize once on return.  Blurs are
+    //  currently optimized for dynamic weights.
+    float3 total = float3(0.0,0.0,0.0);
+    total += wt5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    total += wt4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    total += wt3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    total += wt2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    total += wt1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    total += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    total += wt1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    total += wt2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    total += wt3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    total += wt4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    total += wt5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return total * inv_weight_total;
+}
+
+float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 9-tap 1D Gaussian blur of tex around tex_uv, with taps
+    //              spaced dxdy apart.  May be mipmapped per settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-4:
+    const float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_sigma_sq);
+    const float wt2 = exp(-4.0 * inv_two_sigma_sq);
+    const float wt3 = exp(-9.0 * inv_two_sigma_sq);
+    const float wt4 = exp(-16.0 * inv_two_sigma_sq);
+    const float inv_weight_total = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4));
+    //  Accumulate taps from -4 to +4 and normalize once on return:
+    float3 total = float3(0.0,0.0,0.0);
+    total += wt4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    total += wt3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    total += wt2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    total += wt1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    total += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    total += wt1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    total += wt2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    total += wt3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    total += wt4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return total * inv_weight_total;
+}
+
+float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 7-tap 1D Gaussian blur of tex around tex_uv, with taps
+    //              spaced dxdy apart.  May be mipmapped per settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-3:
+    const float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_sigma_sq);
+    const float wt2 = exp(-4.0 * inv_two_sigma_sq);
+    const float wt3 = exp(-9.0 * inv_two_sigma_sq);
+    const float inv_weight_total = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3));
+    //  Accumulate taps from -3 to +3 and normalize once on return:
+    float3 total = float3(0.0,0.0,0.0);
+    total += wt3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    total += wt2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    total += wt1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    total += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    total += wt1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    total += wt2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    total += wt3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return total * inv_weight_total;
+}
+
+float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 5-tap 1D Gaussian blur of tex around tex_uv, with taps
+    //              spaced dxdy apart.  May be mipmapped per settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-2:
+    const float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_sigma_sq);
+    const float wt2 = exp(-4.0 * inv_two_sigma_sq);
+    const float inv_weight_total = 1.0 / (wt0 + 2.0 * (wt1 + wt2));
+    //  Accumulate taps from -2 to +2 and normalize once on return:
+    float3 total = float3(0.0,0.0,0.0);
+    total += wt2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    total += wt1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    total += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    total += wt1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    total += wt2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return total * inv_weight_total;
+}
+
+float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3-tap 1D Gaussian blur of tex around tex_uv, with taps
+    //              spaced dxdy apart.  May be mipmapped per settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-1:
+    const float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_sigma_sq);
+    const float inv_weight_total = 1.0 / (wt0 + 2.0 * wt1);
+    //  Accumulate taps from -1 to +1 and normalize once on return:
+    float3 total = float3(0.0,0.0,0.0);
+    total += wt1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    total += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    total += wt1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    return total * inv_weight_total;
+}
+
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    An 11-texel 1D Gaussian blur of tex around tex_uv using 6
+    //              linear taps.  May be mipmapped per settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-5:
+    const float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_sigma_sq);
+    const float wt2 = exp(-4.0 * inv_two_sigma_sq);
+    const float wt3 = exp(-9.0 * inv_two_sigma_sq);
+    const float wt4 = exp(-16.0 * inv_two_sigma_sq);
+    const float wt5 = exp(-25.0 * inv_two_sigma_sq);
+    const float inv_weight_total = 1.0 /
+        (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4 + wt5));
+    //  Merge adjacent texels into single linear taps: each tap's weight is the
+    //  pair's sum.  The center texel is shared by two taps, so halve wt0.
+    const float pair01 = wt0 * 0.5 + wt1;
+    const float offs01 = wt1/pair01;
+    const float pair23 = wt2 + wt3;
+    const float offs23 = wt3/pair23;
+    const float pair45 = wt4 + wt5;
+    const float offs45 = wt5/pair45;
+    //  Accumulate taps from the most negative offset to the most positive:
+    float3 total = float3(0.0,0.0,0.0);
+    total += pair45 * tex2D_linearize(tex, tex_uv - (4.0 + offs45) * dxdy).rgb;
+    total += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + offs23) * dxdy).rgb;
+    total += pair01 * tex2D_linearize(tex, tex_uv - offs01 * dxdy).rgb;
+    total += pair01 * tex2D_linearize(tex, tex_uv + offs01 * dxdy).rgb;
+    total += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + offs23) * dxdy).rgb;
+    total += pair45 * tex2D_linearize(tex, tex_uv + (4.0 + offs45) * dxdy).rgb;
+    return total * inv_weight_total;
+}
+
+float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 9-texel 1D Gaussian blur of tex around tex_uv using one
+    //              tap at tex_uv itself plus 4 linear taps.  May be mipmapped
+    //              per settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-4:
+    const float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_sigma_sq);
+    const float wt2 = exp(-4.0 * inv_two_sigma_sq);
+    const float wt3 = exp(-9.0 * inv_two_sigma_sq);
+    const float wt4 = exp(-16.0 * inv_two_sigma_sq);
+    const float inv_weight_total = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4));
+    //  Merge texel pairs (1,2) and (3,4) into linear taps; the center stays alone:
+    const float pair12 = wt1 + wt2;
+    const float offs12 = wt2/pair12;
+    const float pair34 = wt3 + wt4;
+    const float offs34 = wt4/pair34;
+    //  Accumulate taps from the most negative offset to the most positive:
+    float3 total = float3(0.0,0.0,0.0);
+    total += pair34 * tex2D_linearize(tex, tex_uv - (3.0 + offs34) * dxdy).rgb;
+    total += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + offs12) * dxdy).rgb;
+    total += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    total += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + offs12) * dxdy).rgb;
+    total += pair34 * tex2D_linearize(tex, tex_uv + (3.0 + offs34) * dxdy).rgb;
+    return total * inv_weight_total;
+}
+
+float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 7-texel 1D Gaussian blur of tex around tex_uv using 4
+    //              linear taps.  May be mipmapped per settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-3:
+    const float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_sigma_sq);
+    const float wt2 = exp(-4.0 * inv_two_sigma_sq);
+    const float wt3 = exp(-9.0 * inv_two_sigma_sq);
+    const float inv_weight_total = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3));
+    //  Merge adjacent texels into single linear taps: each tap's weight is the
+    //  pair's sum.  The center texel is shared by two taps, so halve wt0.
+    const float pair01 = wt0 * 0.5 + wt1;
+    const float offs01 = wt1/pair01;
+    const float pair23 = wt2 + wt3;
+    const float offs23 = wt3/pair23;
+    //  Accumulate taps from the most negative offset to the most positive:
+    float3 total = float3(0.0,0.0,0.0);
+    total += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + offs23) * dxdy).rgb;
+    total += pair01 * tex2D_linearize(tex, tex_uv - offs01 * dxdy).rgb;
+    total += pair01 * tex2D_linearize(tex, tex_uv + offs01 * dxdy).rgb;
+    total += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + offs23) * dxdy).rgb;
+    return total * inv_weight_total;
+}
+
+float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 5-texel 1D Gaussian blur of tex around tex_uv using one
+    //              tap at tex_uv itself plus 2 linear taps.  May be mipmapped
+    //              per settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-2:
+    const float inv_two_sigma_sq = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_sigma_sq);
+    const float wt2 = exp(-4.0 * inv_two_sigma_sq);
+    const float inv_weight_total = 1.0 / (wt0 + 2.0 * (wt1 + wt2));
+    //  Merge texel pair (1,2) into one linear tap per side; center stays alone:
+    const float pair12 = wt1 + wt2;
+    const float offs12 = wt2/pair12;
+    //  Accumulate taps from the most negative offset to the most positive:
+    float3 total = float3(0.0,0.0,0.0);
+    total += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + offs12) * dxdy).rgb;
+    total += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    total += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + offs12) * dxdy).rgb;
+    return total * inv_weight_total;
+}
+
+float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Get the unnormalized texel weights for distances 0-1.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    //  No weight sum is needed: w01/(w0 + 2.0 * w1) reduces to 0.5 (see below).
+    //  Calculate the combined weight and linear sample ratio for the texel pair.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w01 = w0 * 0.5 + w1;
+    const float w01_ratio = w1/w01;
+    //  Both taps share that normalized weight of 0.5, so just average them:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+
+////////////////////////////  HUGE SEPARABLE BLURS  ////////////////////////////
+
+//  Huge separable blurs come only in "fast" versions.
+float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Get the texel weights; normalization uses a curve fit, not the exact sum:
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    const float w16 = exp(-256.0 * denom_inv);
+    const float w17 = exp(-289.0 * denom_inv);
+    const float w18 = exp(-324.0 * denom_inv);
+    const float w19 = exp(-361.0 * denom_inv);
+    const float w20 = exp(-400.0 * denom_inv);
+    const float w21 = exp(-441.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 +
+    //        w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Each linear tap covers a texel pair: weight = pair sum, ratio = fractional
+    //  offset toward the farther texel.  w0 is shared by two taps, so halve it.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w16_17 = w16 + w17;
+    const float w18_19 = w18 + w19;
+    const float w20_21 = w20 + w21;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    const float w16_17_ratio = w17/w16_17;
+    const float w18_19_ratio = w19/w18_19;
+    const float w20_21_ratio = w21/w20_21;
+    //  Sum the 22 weighted taps (most negative offset first), then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 31x Gaussian blurred texture lookup using 16 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texels 0..15 away: exp(-d*d/(2*sigma^2)).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 +
+    //        w9 + w10 + w11 + w12 + w13 + w14 + w15));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  NOTE(review): helper not shown here; presumably approximates the sum above -- confirm
+    //  Pair adjacent texels into single linear taps: weight = sum of the pair,
+    //  ratio = far texel's share.  Both center taps read texel 0, so halve w0.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    //  Sum the 16 weighted taps, then scale by the normalization factor:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 25x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 12 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Unnormalized Gaussian weights for texels 0..12 away: exp(-d*d/(2*sigma^2)).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  NOTE(review): helper not shown here; presumably approximates the sum above -- confirm
+    //  Pair texels (1,2), (3,4), ... into linear taps; ratio = far texel's share.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w9_10 = w9 + w10;
+    const float w11_12 = w11 + w12;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    const float w9_10_ratio = w10/w9_10;
+    const float w11_12_ratio = w12/w11_12;
+    //  Sum the center texel and 12 weighted taps, then scale by weight_sum_inv:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 17x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 8 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Unnormalized Gaussian weights for texels 0..8 away: exp(-d*d/(2*sigma^2)).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  NOTE(review): helper not shown here; presumably approximates the sum above -- confirm
+    //  Pair texels (1,2), (3,4), ... into linear taps; ratio = far texel's share.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    //  Sum the center texel and 8 weighted taps, then scale by weight_sum_inv:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input.
+    //  Description:
+    //  This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize
+    //  would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize.
+    const float denom_inv = 0.5/(sigma*sigma);
+    //  Load each sample.  We need all 3x3 samples.  Quad-pixel communication
+    //  won't help either: This should perform like tex2Dblur5x5, but sharing a
+    //  4x4 sample field would perform more like tex2Dblur8x8shared (worse).
+    const float2 sample4_uv = tex_uv;  //  center of the 3x3 grid (row-major 0..8)
+    const float2 dx = float2(dxdy.x, 0.0);
+    const float2 dy = float2(0.0, dxdy.y);
+    const float2 sample1_uv = sample4_uv - dy;  //  middle of the -dy row
+    const float2 sample7_uv = sample4_uv + dy;  //  middle of the +dy row
+    const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb;
+    const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
+    const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb;
+    const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb;
+    const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
+    const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb;
+    const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb;
+    const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
+    const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb;
+    //  Statically compute Gaussian sample weights (center, 4 edges, 4 corners):
+    const float w4 = 1.0;
+    const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8));
+    //  Weight and sum the samples, then normalize by the total weight:
+    const float3 sum = w4 * sample4 +
+        w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) +
+        w0_2_6_8 * (sample0 + sample2 + sample6 + sample8);
+    return sum * weight_sum_inv;
+}
+
+
+////////////////////////////  FASTER ONE-PASS BLURS  ///////////////////////////
+
+float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.  Adjust the
+    //  bilinear sample location to reflect the true Gaussian weights for each
+    //  underlying texel.  The following diagram illustrates the relative
+    //  locations of bilinear samples.  Each sample with the same number has the
+    //  same weight (notice the symmetry).  The letters a, b, c, d distinguish
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float w4off = exp(-16.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    const float texel3to4ratio = w4off/(w3off + w4off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0);
+    const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+    const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio);
+    const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio);
+    const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2R1 = w3off;
+    const float w2R2 = w4off;
+    const float w3d1 =     exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w3d4 =     exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv);
+    const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv);
+    const float w6d1 =     exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv);
+    const float w6d4 =     exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2R1 + w2R2;
+    const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    const float w5 = w4;  //  sample 5's offset is sample 4's with x/y swapped
+    const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear:
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum weighted samples, then scale by weight_sum_inv (so total weight = 1.0).
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
+    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
+    const float texel2to3ratio = w3off/(w2off + w3off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+    const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1abcd = 1.0;
+    const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv);
+    const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv);
+    const float w1d4 =       exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d3_3d2 =   exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4_3d4 =   exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d1 =       exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d2_4d3 =   exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4 =       exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights.
+    //  Split weights for shared texels between samples sharing them:
+    const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;
+    const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;
+    const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum weighted samples, then scale by weight_sum_inv (so total weight = 1.0).
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
+    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
+    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
+    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  First see the description for tex2Dblur9x9().  This blur uses the same
+    //  concept and sample/texel locations except on a smaller scale.  Samples:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2d1 =   exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4 =   exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum weighted samples, then scale by weight_sum_inv (so total weight = 1.0).
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2a + sample2b + sample2c + sample2d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed of
+    //              2x2 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      0a 0b
+    //      0c 0d
+    //  The texel layout is as follows.  Note that samples 0a/0b and 0c/0d share
+    //  a vertical column of texels, and samples 0a/0c and 0b/0d share a
+    //  horizontal row of texels (all samples share the center texel):
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);  //  center texel is shared, so it counts half per axis
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 4 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb;
+    const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb;
+    const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb;
+    const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Weights for all samples are the same, so just average them:
+    return 0.25 * (sample0a + sample0b + sample0c + sample0d);
+}
+
+
+//////////////////  LINEAR ONE-PASS BLURS WITH SHARED SAMPLES  /////////////////
+
+float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   1.) Same as tex2Dblur9()
+    //              2.) ddx() and ddy() are present in the current Cg profile.
+    //              3.) The GPU driver is using fine/high-quality derivatives.
+    //              4.) quad_vector *correctly* describes the current fragment's
+    //                  location in its pixel quad, by the conventions noted in
+    //                  get_quad_vector[_naive].
+    //              5.) tex_uv.w = log2(video_size/output_size).y
+    //              6.) tex2Dlod() is present in the current Cg profile.
+    //  Optional:   Tune artifacts vs. excessive blurriness with the global
+    //              float error_blurring.
+    //  Returns:    A blurred texture lookup using a "virtual" 12x12 Gaussian
+    //              blur (a 6x6 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  Perform a 1-pass blur with shared texture lookups across a pixel quad.
+    //  We'll get neighboring samples with high-quality ddx/ddy derivatives, as
+    //  in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
+    //  Message Passing" by Eric Penner.
+    //
+    //  Each fragment of our "virtual" 12x12 blur loads (6^2)/4 + 3 = 12
+    //  bilinear samples, where bilinear sampling positions are computed from
+    //  the relative Gaussian weights of the 4 surrounding texels.  The catch is
+    //  that the appropriate texel weights and sample coords differ for each
+    //  fragment, but we're reusing most of the same samples across a quad of
+    //  destination fragments.  (We do use unique coords for the four nearest
+    //  samples at each fragment.)  Mixing bilinear filtering and sample-sharing
+    //  therefore introduces some error into the weights, and this can get nasty
+    //  when the source image is small or high-frequency.  Computing bilinear
+    //  ratios based on weights at the sample field center results in sharpening
+    //  and ringing artifacts, but we can move samples closer to halfway between
+    //  texels to try blurring away the error (which can move features around by
+    //  a texel or so).  Tune this with the global float "error_blurring".
+    //
+    //  The pixel quad's sample field covers 12x12 texels, accessed through 6x6
+    //  bilinear (2x2 texel) taps.  Each fragment depends on a window of 10x10
+    //  texels (5x5 bilinear taps), and each fragment is responsible for loading
+    //  a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps
+    //  to use unique bilinear coords for sample0* for each fragment.  This
+    //  diagram illustrates the relative locations of bilinear samples 1-9 for
+    //  each quadrant a, b, c, d (note samples will not be equally spaced):
+    //      8a 7a 6a 6b 7b 8b
+    //      5a 4a 3a 3b 4b 5b
+    //      2a 1a 0a 0b 1b 2b
+    //      2c 1c 0c 0d 1d 2d
+    //      5c 4c 3c 3d 4d 5d
+    //      8c 7c 6c 6d 7d 8d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2 texel block:
+    //      8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
+    //      8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
+    //      5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
+    //      5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
+    //      2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
+    //      2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
+    //      2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
+    //      2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
+    //      5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
+    //      5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
+    //      8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
+    //      8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
+    //  With this symmetric arrangement, we don't have to know which absolute
+    //  quadrant a sample lies in to assign kernel weights; it's enough to know
+    //  the sample number and the relative quadrant of the sample (relative to
+    //  the current quadrant):
+    //      {current, adjacent x, adjacent y, diagonal}
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
+    //  and [4, 5] away from the fragment, and reuse them independently for both
+    //  dimensions.  Use the sample field center as the estimated destination,
+    //  but nudge the result closer to halfway between texels to blur error.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  based on the sum of their 4 underlying texel weights.  Assume a same-
+    //  resolution blur, so each symmetrically named sample weight will compute
+    //  the same at every fragment in the pixel quad: We can therefore compute
+    //  texel weights based only on the bottom-right quadrant (fragment at 0d0).
+    //  To avoid too much boilerplate code, use a macro to get all 4 texel
+    //  weights for a bilinear sample based on the offset of its top-left texel:
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
+    const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
+    const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
+    const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
+    const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
+    const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
+    const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
+    const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
+    const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
+    const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    float3 sample8adjx, sample8adjy, sample8diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
+    //  Sum weighted samples (normalized by weight_sum_inv on return).
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
+    sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
+    sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
+    sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 10x10 Gaussian
+    //              blur (a 5x5 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 25 of the 36 samples taken across the pixel quad (to cover a
+    //  5x5 sample area, or 10x10 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 11 omitted samples
+    //  are always the same, named relative to the current quadrant:
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 25 of the 36 sample weights.  Skip the following weights:
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w4curr + w5curr + w6curr + w7curr + w8curr +
+        w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
+        w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
+        w0diag + w1diag + w3diag + w4diag);
+    //  Statically pack most weights for runtime.  Note the mixed packing:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);
+    const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad in order of need:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    //  Sum weighted samples (normalization happens just before returning).
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result.  First do the simple ones:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    //  Now do the mixed-sample ones:
+    sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
+    sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
+    sum += w8curr * sample8curr;
+    //  Normalize the sum (so the weights add to 1.0) and return:
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 8x8 Gaussian
+    //              blur (a 4x4 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This function
+    //  shares the same concept and a similar sample placement, except each
+    //  quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
+    //  respectively.  There could be a total of 16 samples, 4 of which each
+    //  fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
+    //  its own offset to reduce shared sample artifacts, bringing the sample
+    //  count for each fragment to 7.  Sample placement:
+    //      3a 2a 2b 3b
+    //      1a 0a 0b 1b
+    //      1c 0c 0d 1d
+    //      3c 2c 2d 3d
+    //  Texel placement:
+    //      3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
+    //      3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
+    //      1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
+    //      1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
+    //      1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
+    //      1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
+    //      3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
+    //      3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
+    
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 6x6 Gaussian
+    //              blur (a 3x3 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur8x8shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 9 of the 16 samples taken across the pixel quad (to cover a
+    //  3x3 sample area, or 6x6 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 7 omitted samples
+    //  are always the same ones:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 9 of the 16 sample weights.  Skip the following weights:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the inverse of the sum of the 9 used weights (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
+    //  Statically pack the four sample0* weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch sample1*/sample2* from the other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result for sample0*, and handle the rest
+    //  of the weights (1curr, 1adjy, 2curr, 2adjx, 3curr) directly/verbosely:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
+            w2adjx * sample2adjx + w3curr * sample3curr;
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  MAX OPTIMAL SIGMA BLUR WRAPPERS  //////////////////////
+
+//  The following blurs are static wrappers around the dynamic blurs above.
+//  HOPEFULLY, the compiler will be smart enough to do constant-folding.
+
+//  Resizable separable blurs:
+inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur11_std_dev.
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur9_std_dev.
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur7_std_dev.
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur5_std_dev.
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur3_std_dev.
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Fast separable blurs:
+inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur11_std_dev.
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur9_std_dev.
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur7_std_dev.
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur5_std_dev.
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur3_std_dev.
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Huge, "fast" separable blurs:
+inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur43_std_dev.
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur31_std_dev.
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur25_std_dev.
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur17_std_dev.
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+//  Resizable one-pass blurs:
+inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur3_std_dev.
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" one-pass blurs:
+inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur9_std_dev.
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur7_std_dev.
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur5_std_dev.
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{   //  Not recursion: delegates to the 4-arg overload, appending blur3_std_dev.
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" shared-sample one-pass blurs:
+inline float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{   //  Not recursion: delegates to the 5-arg overload, appending blur12_std_dev.
+    return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev);
+}
+inline float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{   //  Not recursion: delegates to the 5-arg overload, appending blur10_std_dev.
+    return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev);
+}
+inline float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{   //  Not recursion: delegates to the 5-arg overload, appending blur8_std_dev.
+    return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev);
+}
+inline float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{   //  Not recursion: delegates to the 5-arg overload with sigma = blur6_std_dev.
+    return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev);
+}
+
+
+#endif  //  BLUR_FUNCTIONS_H
+
+////////////////////////////  END BLUR-FUNCTIONS  ///////////////////////////
+
+//#include "bloom-functions.h"
+
+////////////////////////////  BEGIN BLOOM-FUNCTIONS  ///////////////////////////
+
+#ifndef BLOOM_FUNCTIONS_H
+#define BLOOM_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These utility functions and constants help several passes determine the
+//  size and center texel weight of the phosphor bloom in a uniform manner.
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  We need to calculate the correct blur sigma using some .cgp constants:
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite:
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif  //  DRIVERS_ALLOW_DERIVATIVES
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the RetroArch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics.)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (Left disabled: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?  (Integrated graphics mode disables it.)
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart of the blur size listed for each max triad size:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default (an earlier note found it too dark and preferred 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  option 2.) above
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  option 1.) above
+#endif  //  RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;      //  options 0, 1, 2
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; each component's range is
+    //  [-2, 2].
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume BFF (bottom-field first) order instead of
+    //  TFF (top-field first)?  (Whether this matters depends on the input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and the
+    //  blue offset is the negative r offset; range [0, 0.5] (magnitude? confirm)
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);    //  alternative: float2(0.0)
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try 1.0 if the output looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;    //  presumably the bloom-skipping version (higher value) - confirm
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);    //  1/16 per axis
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  15-pixel stripes (the default, since the #define above is commented out)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16 floor; note abs() discards c's sign
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD     //  force it on unless already #defined
+#endif  //  !SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE    //  no manual mask resizing
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE            //  no runtime geometry mode
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static setting as-is
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE    //  may already be gone (compatibility mode above)
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS         //  #undef'ed again below if tex2Dlod stays enabled
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS     //  #undef'ed again below if tex2Dlod stays enabled
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC    //  presumably relies on tex2Dlod as well - confirm
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  tex2Dlod strategy not in effect: try tex2Dbias next
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  neither tex2Dlod nor tex2Dbias: tile-flat-twice beats fix-discontinuities
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS  //  backup not needed: tex2Dlod is usable
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  tex2Dlod resampling strategy not requested: small LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  no tex2Dlod support: small LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  no dynamic branches: enable the runtime path only if explicitly forced
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  !GEOMETRY_EARLY: no antialiasing border is reserved
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  tex2Dlod/tex2Dbias strategy in effect: no extra 0.5 is added
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else   //  strategy not in effect: border is unchanged
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else   //  !GEOMETRY_EARLY: no 3x factor, just round up to whole texels
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case (min-size) triads:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  general case: one tile plus a tile border on each side
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: always one tile plus a tile border on each side
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;       //  tiles * triads per tile
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/blur-functions.h"
+
+////////////////////////////  BEGIN BLUR-FUNCTIONS  ///////////////////////////
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  output_size < video_size.
+//              4.) output_size == video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (video_size/output_size)/texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(video_size/output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static const float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static const float blur3_std_dev
+//                      static const float blur4_std_dev
+//                      static const float blur5_std_dev
+//                      static const float blur6_std_dev
+//                      static const float blur7_std_dev
+//                      static const float blur8_std_dev
+//                      static const float blur9_std_dev
+//                      static const float blur10_std_dev
+//                      static const float blur11_std_dev
+//                      static const float blur12_std_dev
+//                      static const float blur17_std_dev
+//                      static const float blur25_std_dev
+//                      static const float blur31_std_dev
+//                      static const float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static const float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+#ifndef OVERRIDE_BLUR_STD_DEVS
+    //  blurN_std_dev values are specified in terms of dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  By request, we can define standard deviations corresponding to a
+        //  binomial distribution with p = 0.5 (related to Pascal's triangle).
+        //  This distribution works such that blurring multiple times should
+        //  have the same result as a single larger blur.  These values are
+        //  larger than default for blurs up to 6x and smaller thereafter.
+        static const float blur3_std_dev = 0.84931640625;
+        static const float blur4_std_dev = 0.84931640625;
+        static const float blur5_std_dev = 1.0595703125;
+        static const float blur6_std_dev = 1.06591796875;
+        static const float blur7_std_dev = 1.17041015625;
+        static const float blur8_std_dev = 1.1720703125;
+        static const float blur9_std_dev = 1.2259765625;
+        static const float blur10_std_dev = 1.21982421875;
+        static const float blur11_std_dev = 1.25361328125;
+        static const float blur12_std_dev = 1.2423828125;
+        static const float blur17_std_dev = 1.27783203125;
+        static const float blur25_std_dev = 1.2810546875;
+        static const float blur31_std_dev = 1.28125;
+        static const float blur43_std_dev = 1.28125;
+    #else   //  !USE_BINOMIAL_BLUR_STD_DEVS: default standard deviations
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        static const float blur3_std_dev = 0.62666015625;
+        static const float blur4_std_dev = 0.66171875;
+        static const float blur5_std_dev = 0.9845703125;
+        static const float blur6_std_dev = 1.02626953125;
+        static const float blur7_std_dev = 1.36103515625;
+        static const float blur8_std_dev = 1.4080078125;
+        static const float blur9_std_dev = 1.7533203125;
+        static const float blur10_std_dev = 1.80478515625;
+        static const float blur11_std_dev = 2.15986328125;
+        static const float blur12_std_dev = 2.215234375;
+        static const float blur17_std_dev = 3.45535583496;
+        static const float blur25_std_dev = 5.3409576416;
+        static const float blur31_std_dev = 6.86488037109;
+        static const float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
+    //  in shared-sample blurs but increase blurring and feature shifting.
+    static const float error_blurring = 0.5;    //  default: midpoint of the allowed range
+#endif  //  OVERRIDE_ERROR_BLURRING
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define crt_gamma, gba_gamma, lcd_gamma:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Defaults: high CRT reference gamma, 3.5 for GBA, office LCD gamma
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define intermediate/input/output_gamma:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else   //  No override: the first SIMULATE_* macro that is #defined below wins
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  !SIMULATE_CRT_ON_LCD
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  !SIMULATE_GBA_ON_LCD
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  !SIMULATE_LCD_ON_CRT
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO  //  Only the first pass decodes and only the last pass encodes
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif  //  LAST_PASS
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode a color for this pass's output.  Passes that do not
+    //  encode (gamma_encode_output == false) return color unchanged.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    //  Raise rgb to the reciprocal of the pass output gamma.  Alpha is never
+    //  gamma-encoded: it is forced to 1.0 under assume_opaque_alpha and is
+    //  passed through otherwise.
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize a sample read from this pass's input.  Passes that do not
+    //  linearize (linearize_input == false) return color unchanged.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    //  Raise rgb to the pass input gamma.  Alpha is never gamma-decoded:
+    //  it is forced to 1.0 under assume_opaque_alpha and is passed through
+    //  otherwise.
+    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Linearize rgb with the caller's per-channel gamma (linearize_input is
+    //  not consulted).  Alpha is 1.0 under assume_opaque_alpha, else kept.
+    float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample, then decode_input().  float3 coords use only .xy.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): below, texel_off is passed as textureLod's LOD argument, not as a texel offset - confirm.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: only tex_coords.xy is used (.zw is ignored); the first overload samples LOD 0.0.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): texel_off is passed as textureLod's LOD argument here, not as a texel offset - confirm.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
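+//  NOTE(review): no comment terminator here - the block comment opened before the tex3D wrappers stays open through tex2D_linearize_gamma and ends after the tex2Dfetch_linearize_gamma wrappers.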
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: only tex_coords.xy is used (.zw is ignored); the first overload samples LOD 0.0.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): texel_off is passed as textureLod's LOD argument here, not as a texel offset - confirm.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "quad-pixel-communication.h"
+
+///////////////////////  BEGIN QUAD-PIXEL-COMMUNICATION  //////////////////////
+
+#ifndef QUAD_PIXEL_COMMUNICATION_H
+#define QUAD_PIXEL_COMMUNICATION_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey*
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DISCLAIMER  /////////////////////////////////
+
+//  *This code was inspired by "Shader Amortization using Pixel Quad Message
+//  Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2.  My intent
+//  is not to plagiarize his fundamentally similar code and assert my own
+//  copyright, but the algorithmic helper functions require so little code that
+//  implementations can't vary by much except bugfixes and conventions.  I just
+//  wanted to license my own particular code here to avoid ambiguity and make it
+//  clear that as far as I'm concerned, people can do as they please with it.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  Given screen pixel numbers, derive a "quad vector" describing a fragment's
+//  position in its 2x2 pixel quad.  Given that vector, obtain the values of any
+//  variable at neighboring fragments.
+//  Requires:   Using this file in general requires:
+//              1.) ddx() and ddy() are present in the current Cg profile.
+//              2.) The GPU driver is using fine/high-quality derivatives.
+//                  Functions will give incorrect results if this is not true,
+//                  so a test function is included.
+
+
+/////////////////////  QUAD-PIXEL COMMUNICATION PRIMITIVES  ////////////////////
+
+float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   The current fragment's output pixel number, measured two
+    //              ways over ([0, output_size.x), [0, output_size.y)):
+    //              1.) output_pixel_num_wrt_uvxy.xy increase with uv coords.
+    //              2.) output_pixel_num_wrt_uvxy.zw increase with screen xy.
+    //  Returns:    The fragment's position in its 2x2 quad, measured two ways:
+    //              1.) .xy is the 2x2 placement with respect to uv direction
+    //                  (the origin (0, 0) is at the top-left):
+    //                  top-left     = (-1.0, -1.0) top-right    = ( 1.0, -1.0)
+    //                  bottom-left  = (-1.0,  1.0) bottom-right = ( 1.0,  1.0)
+    //                  Use this to arrange/weight shared texture samples.
+    //              2.) .zw is the 2x2 placement with respect to screen xy
+    //                  direction (position); the origin varies.
+    //                  quad_gather needs this measure to work correctly.
+    //              Note: quad_vector.zw = quad_vector.xy * float2(
+    //                      ddx(output_pixel_num_wrt_uvxy.x),
+    //                      ddy(output_pixel_num_wrt_uvxy.y));
+    //  Caveats:    Assumes the GPU driver always starts 2x2 pixel quads at
+    //              even pixel numbers, which can be wrong (nondeterministically
+    //              so) for odd output resolutions.
+    //  For whole pixel numbers: 0.0 if even, 1.0 if odd; then remap to -1.0/+1.0:
+    float4 parity = 2.0 * frac(0.5 * output_pixel_num_wrt_uvxy);
+    return 2.0 * parity - float4(1.0);
+}
+
+float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   The same input as get_quad_vector_naive() (read it first).
+    //  Returns:    The same quad vector as get_quad_vector_naive(), mirrored
+    //              where needed so it stays correct when the 2x2 pixel quad
+    //              starts at an odd pixel (possible at odd resolutions).
+    float4 naive = get_quad_vector_naive(output_pixel_num_wrt_uvxy);
+    //  If naive.zw doesn't increase with screen xy, the 2x2 pixel quad starts
+    //  at an odd pixel; half the derivative gives the mirroring factor:
+    float mirror_x = 0.5 * ddx(naive.z);
+    float mirror_y = 0.5 * ddy(naive.w);
+    float2 odd_start_mirror = float2(mirror_x, mirror_y);
+    return naive * odd_start_mirror.xyxy;
+}
+
+float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
+{
+    //  Requires:   1.) ddx() and ddy() exist in the current Cg profile.
+    //              2.) output_pixel_num_wrt_uv increases with uv coords and
+    //                  gives the current fragment's output pixel number in:
+    //                      ([0, output_size.x), [0, output_size.y))
+    //  Returns:    The same quad vector as get_quad_vector_naive() (read it
+    //              first): .xy is relative to uv, .zw is relative to screen xy.
+    //              It stays correct even when the 2x2 pixel quad starts at an
+    //              odd pixel, which can happen at odd resolutions.
+    //  Caveats:    Needs less input than the float4 overload, but it's
+    //              potentially slower.
+    //  Parity-based guess of the uv-relative placement:
+    float2 parity_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
+    float2 guess_uv = (parity_uv - float2(0.5)) * 2.0;
+    //  Direction of (screen.x, screen.y) relative to (uv.x, uv.y), in {-1, 1}:
+    float2 uv_to_screen = float2(ddx(output_pixel_num_wrt_uv.x),
+                                 ddy(output_pixel_num_wrt_uv.y));
+    float2 guess_screen = guess_uv * uv_to_screen;
+    //  If guess_screen doesn't increase with screen xy, the 2x2 pixel quad
+    //  starts at an odd pixel and the whole guess must be mirrored:
+    float2 odd_start_mirror = 0.5 * float2(ddx(guess_screen.x),
+                                           ddy(guess_screen.y));
+    float4 guess = float4(guess_uv, guess_screen);
+    return guess * odd_start_mirror.xyxy;
+}
+
+void quad_gather(float4 quad_vector, float4 curr,
+    out float4 adjx, out float4 adjy, out float4 diag)
+{
+    //  Requires:   1.) ddx()/ddy() exist in the current Cg profile and the
+    //                  GPU driver uses fine/high-quality derivatives.
+    //              2.) quad_vector follows get_quad_vector()'s conventions.
+    //  Returns:    Through the out parameters, the value of curr at the
+    //              quad's x-adjacent, y-adjacent, and diagonal fragments.
+    float4 step_x = ddx(curr) * quad_vector.z;
+    float4 step_y = ddy(curr) * quad_vector.w;
+    adjx = curr - step_x;
+    adjy = curr - step_y;
+    diag = adjx - ddy(adjx) * quad_vector.w;
+}
+
+void quad_gather(float4 quad_vector, float3 curr,
+    out float3 adjx, out float3 adjy, out float3 diag)
+{
+    float2 quad_pos = quad_vector.zw;   //  float3 overload of quad_gather()
+    adjx = curr - ddx(curr) * quad_pos.x;
+    adjy = curr - ddy(curr) * quad_pos.y;
+    diag = adjx - ddy(adjx) * quad_pos.y;
+}
+
+void quad_gather(float4 quad_vector, float2 curr,
+    out float2 adjx, out float2 adjy, out float2 diag)
+{
+    float2 quad_pos = quad_vector.zw;   //  float2 overload of quad_gather()
+    adjx = curr - ddx(curr) * quad_pos.x;
+    adjy = curr - ddy(curr) * quad_pos.y;
+    diag = adjx - ddy(adjx) * quad_pos.y;
+}
+
+float4 quad_gather(float4 quad_vector, float curr)
+{
+    //  Scalar overload.  Packs the whole quad into one float4:
+    //      .x = current fragment       .y = x-adjacent fragment
+    //      .z = y-adjacent fragment    .w = diagonal fragment
+    float adjx = curr - ddx(curr) * quad_vector.z;
+    float2 row = float2(curr, adjx);
+    //  One ddy() over (current, adjx) yields both the y-adjacent and the
+    //  diagonal values:
+    float2 other_row = row - ddy(row) * quad_vector.w;
+    return float4(row, other_row);
+}
+
+float4 quad_gather_sum(float4 quad_vector, float4 curr)
+{
+    //  Requires:   Everything quad_gather() requires.
+    //  Returns:    curr summed over all four fragments of the 2x2 quad.
+    float4 nbr_x, nbr_y, nbr_diag;
+    quad_gather(quad_vector, curr, nbr_x, nbr_y, nbr_diag);
+    return curr + nbr_x + nbr_y + nbr_diag;
+}
+
+float3 quad_gather_sum(float4 quad_vector, float3 curr)
+{
+    //  float3 overload: curr summed over the whole 2x2 quad.
+    float3 nbr_x, nbr_y, nbr_diag;
+    quad_gather(quad_vector, curr, nbr_x, nbr_y, nbr_diag);
+    return curr + nbr_x + nbr_y + nbr_diag;
+}
+
+float2 quad_gather_sum(float4 quad_vector, float2 curr)
+{
+    //  float2 overload: curr summed over the whole 2x2 quad.
+    float2 nbr_x, nbr_y, nbr_diag;
+    quad_gather(quad_vector, curr, nbr_x, nbr_y, nbr_diag);
+    return curr + nbr_x + nbr_y + nbr_diag;
+}
+
+float quad_gather_sum(float4 quad_vector, float curr)
+{
+    //  Scalar overload: add up the four values packed by quad_gather().
+    float4 quad = quad_gather(quad_vector, curr);
+    return quad.x + quad.y + quad.z + quad.w;
+}
+
+bool fine_derivatives_working(float4 quad_vector, float4 curr)
+{
+    //  Requires:   1.) ddx() and ddy() exist in the current Cg profile.
+    //              2.) quad_vector follows get_quad_vector()'s conventions.
+    //              3.) curr is a test vector whose derivatives aren't constant
+    //                  (it should vary nonlinearly across fragments).
+    //  Returns:    true if fine/hybrid/high-quality derivatives were detected;
+    //              false if derivatives are coarse or the test is inconclusive.
+    //  Method:     Fine derivatives are proven if, for any component at any
+    //              fragment, ddy() differs between curr and its x-neighbor or
+    //              ddx() differs between curr and its y-neighbor.
+    //  TODO: Check for floating point exact comparison issues!
+    float4 ddx_curr = ddx(curr);
+    float4 ddy_curr = ddy(curr);
+    float4 ddy_adjx = ddy(curr - ddx_curr * quad_vector.z);
+    float4 ddx_adjy = ddx(curr - ddy_curr * quad_vector.w);
+    bool ddy_different = any(bool4(ddy_curr.x != ddy_adjx.x,
+        ddy_curr.y != ddy_adjx.y, ddy_curr.z != ddy_adjx.z,
+        ddy_curr.w != ddy_adjx.w));
+    bool ddx_different = any(bool4(ddx_curr.x != ddx_adjy.x,
+        ddx_curr.y != ddx_adjy.y, ddx_curr.z != ddx_adjy.z,
+        ddx_curr.w != ddx_adjy.w));
+    return any(bool2(ddy_different, ddx_different));
+}
+
+bool fine_derivatives_working_fast(float4 quad_vector, float curr)
+{
+    //  Requires:   Same as fine_derivatives_working().
+    //  Returns:    Same as fine_derivatives_working().
+    //  Usage:      Cheaper than fine_derivatives_working(), but it only runs
+    //              one scalar comparison, so false negatives are more likely
+    //              and it's less useful for offline testing/debugging.  As of
+    //              May 2014 it's also useless for dynamic runtime branching,
+    //              because derivatives (and quad-pixel communication) are
+    //              disallowed inside branches.  Future GPU's might permit them
+    //              if the branch condition is guaranteed to evaluate the same
+    //              for every fragment in the quad (by your promise and/or by
+    //              the driver letting one fragment decide).  If so, this
+    //              version may become the more economical choice.
+    //  Compare ddy() at this fragment against ddy() at the x-adjacent one:
+    float adjx = curr - ddx(curr) * quad_vector.z;
+    float ddy_here = ddy(curr);
+    float ddy_adjx = ddy(adjx);
+    return (ddy_here != ddy_adjx);
+}
+
+#endif  //  QUAD_PIXEL_COMMUNICATION_H
+
+////////////////////////  END QUAD-PIXEL-COMMUNICATION  ///////////////////////
+
+//#include "special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the usual argument of erf().
+    //  Returns:    An Abramowitz/Stegun approximation of
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              with a max absolute error of 2.5*10**-5.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    //  The polynomial is evaluated on abs(x), then x's sign is reapplied.
+    static const float4 one = float4(1.0);
+    const float4 sign_x = sign(x);
+    const float4 t = one/(one + 0.47047*abs(x));
+    const float4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float4 magnitude = one - poly*exp(-(x*x));
+    return magnitude * sign_x;
+}
+
+float3 erf6(const float3 x)
+{
+    //  float3 overload: polynomial on abs(x), then x's sign is reapplied.
+    static const float3 one = float3(1.0);
+    const float3 sign_x = sign(x);
+    const float3 t = one/(one + 0.47047*abs(x));
+    const float3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float3 magnitude = one - poly*exp(-(x*x));
+    return magnitude * sign_x;
+}
+
+float2 erf6(const float2 x)
+{
+    //  float2 overload: polynomial on abs(x), then x's sign is reapplied.
+    static const float2 one = float2(1.0);
+    const float2 sign_x = sign(x);
+    const float2 t = one/(one + 0.47047*abs(x));
+    const float2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float2 magnitude = one - poly*exp(-(x*x));
+    return magnitude * sign_x;
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload: polynomial on abs(x), then x's sign is reapplied.
+    const float sign_x = sign(x);
+    const float t = 1.0/(1.0 + 0.47047*abs(x));
+    const float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float magnitude = 1.0 - poly*exp(-(x*x));
+    return magnitude * sign_x;
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the usual argument of erf().
+    //  Returns:    erf() approximated by a scaled hyperbolic tangent.  The
+    //              error is visually noticeable, but it's very fast and
+    //              perceptually close (at least on ATI hardware).  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Needs a driver with a correct tanh(); the author's nVidia
+    //              8800GTS returned garbage output.
+    return tanh(1.202760580 * x);
+}
+
+float3 erft(const float3 x)
+{
+    const float3 scaled_x = 1.202760580 * x;    //  float3 overload of erft()
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    const float2 scaled_x = 1.202760580 * x;    //  float2 overload of erft()
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    const float scaled_x = 1.202760580 * x;     //  scalar overload of erft()
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the usual argument of erf().
+    //  Returns:    erf6(x) by default, or erft(x) if ERF_FAST_APPROXIMATION.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  float3 overload: erf6() unless ERF_FAST_APPROXIMATION selects erft().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  float2 overload: erf6() unless ERF_FAST_APPROXIMATION selects erft().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload: erf6() unless ERF_FAST_APPROXIMATION selects erft().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the usual gamma function argument, expected to
+    //                  lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s, precomputed by the caller so it can be
+    //                  reused elsewhere.
+    //  Returns:    Approximate gamma(s) (the real-numbered factorial) from a
+    //              two-coefficient Lanczos approximation.  The coefficients
+    //              were calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              For s in [0, 36] an optimal g is ~1.12906830989, with a
+    //              max relative error of 0.000463 over 2**16 equally-spaced
+    //              evals.  Three coeffs (0.0000346 error) wouldn't hurt
+    //              latency, but two leave more parallelism for outside code.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler_e = float4(2.71828182845904523536028747135266249775724709);
+    const float4 exponent = s + float4(0.5);
+    const float4 series = coeff0 + coeff1/(s + float4(1.0));
+    const float4 base = (exponent + lanczos_g)/euler_e;
+    //  base**exponent * series approximates gamma(s + 1); dividing by s gives
+    //  gamma(s).  For small s this has less error than starting with
+    //  (s -= 1.0).
+    const float4 gamma_s_plus_1 = pow(base, exponent) * series;
+    return gamma_s_plus_1 * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  float3 overload: gamma(s + 1) from the Lanczos sum, times s_inv.
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler_e = float3(2.71828182845904523536028747135266249775724709);
+    const float3 exponent = s + float3(0.5);
+    const float3 series = coeff0 + coeff1/(s + float3(1.0));
+    const float3 base = (exponent + lanczos_g)/euler_e;
+    return (pow(base, exponent) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  float2 overload: gamma(s + 1) from the Lanczos sum, times s_inv.
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler_e = float2(2.71828182845904523536028747135266249775724709);
+    const float2 exponent = s + float2(0.5);
+    const float2 series = coeff0 + coeff1/(s + float2(1.0));
+    const float2 base = (exponent + lanczos_g)/euler_e;
+    return (pow(base, exponent) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload: gamma(s + 1) from the Lanczos sum, times s_inv.
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler_e = 2.71828182845904523536028747135266249775724709;
+    const float exponent = s + 0.5;
+    const float series = coeff0 + coeff1/(s + 1.0);
+    const float base = (exponent + lanczos_g)/euler_e;
+    return (pow(base, exponent) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the usual gamma function argument in [0, 36].
+    //  Returns:    Approximate gamma(s) with a max relative error of
+    //              0.000463; gamma_impl() documents the method.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    const float3 s_inv = float3(1.0)/s;     //  float3 overload of gamma()
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    const float2 s_inv = float2(1.0)/s;     //  float2 overload of gamma()
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    const float s_inv = 1.0/s;              //  scalar overload of gamma()
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    The first four terms of a series for the lower incomplete
+    //              gamma function, meant for small s and small z:
+    //                  z**s * sum(k = 0..3) of (-z)**k / ((s + k) * k!)
+    //  In "rolled up" form the summation would read:
+    //      sign = 1.0; z_pow = 1.0; factorial = 1.0;
+    //      sum = sign * z_pow / ((s + 0.0) * factorial);
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          sign *= -1.0; z_pow *= z; factorial *= k;
+    //          sum += sign * z_pow / ((s + k) * factorial);
+    //      }
+    //  Below it is unrolled with the factorials folded into each denominator:
+    //      (s + 1) * 1! = s + 1,  (s + 2) * 2! = 2s + 4,  (s + 3) * 3! = 6s + 18
+    const float4 z_sq = z*z;
+    const float4 z_cu = z * z_sq;
+    const float4 term1 = z/(s + float4(1.0));
+    const float4 term2 = z_sq/(2.0*s + float4(4.0));
+    const float4 term3 = z_cu/(6.0*s + float4(18.0));
+    //  A k = 4 term is left disabled:
+    //      term4 = z_sq * z_sq/(24.0*s + float4(96.0));
+    float4 sum = s_inv;     //  k = 0 term
+    sum -= term1;
+    sum += term2;
+    sum -= term3;
+    return pow(z, s) * sum;     //  apply the common z**s factor
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  float3 overload: z**s times the four-term alternating series.
+    const float3 z_sq = z*z;
+    const float3 z_cu = z * z_sq;
+    const float3 term1 = z/(s + float3(1.0));
+    const float3 term2 = z_sq/(2.0*s + float3(4.0));
+    const float3 term3 = z_cu/(6.0*s + float3(18.0));
+    float3 sum = s_inv;     //  k = 0 term
+    sum -= term1;
+    sum += term2;
+    sum -= term3;
+    return pow(z, s) * sum;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  float2 overload: z**s times the four-term alternating series.
+    const float2 z_sq = z*z;
+    const float2 z_cu = z * z_sq;
+    const float2 term1 = z/(s + float2(1.0));
+    const float2 term2 = z_sq/(2.0*s + float2(4.0));
+    const float2 term3 = z_cu/(6.0*s + float2(18.0));
+    float2 sum = s_inv;     //  k = 0 term
+    sum -= term1;
+    sum += term2;
+    sum -= term3;
+    return pow(z, s) * sum;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload: z**s times the four-term alternating series.
+    const float z_sq = z*z;
+    const float z_cu = z * z_sq;
+    const float term1 = z/(s + 1.0);
+    const float term2 = z_sq/(2.0*s + 4.0);
+    const float term3 = z_cu/(6.0*s + 18.0);
+    float sum = s_inv;      //  k = 0 term
+    sum -= term1;
+    sum += term2;
+    sum -= term3;
+    return pow(z, s) * sum;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    The upper incomplete gamma function from Gauss's continued
+    //              fraction representation, truncated to four levels.
+    //  The truncated denominator is built "from the bottom up."  Rolled up:
+    //      denom = float4('inf');
+    //      float4 one = float4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+    //      }
+    //  Unrolled below with i * (s - i) expanded; dN is the denominator after
+    //  the i = N iteration (d4's fraction vanishes because it divides by inf):
+    const float4 numerator = pow(z, s) * exp(-z);
+    const float4 d4 = float4(7.0) + z - s;
+    const float4 d3 = float4(5.0) + z - s + (3.0*s - float4(9.0))/d4;
+    const float4 d2 = float4(3.0) + z - s + (2.0*s - float4(4.0))/d3;
+    const float4 d1 = float4(1.0) + z - s + (s - float4(1.0))/d2;
+    return numerator / d1;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  float3 overload: four-level continued fraction, built bottom-up.
+    const float3 numerator = pow(z, s) * exp(-z);
+    const float3 d4 = float3(7.0) + z - s;
+    const float3 d3 = float3(5.0) + z - s + (3.0*s - float3(9.0))/d4;
+    const float3 d2 = float3(3.0) + z - s + (2.0*s - float3(4.0))/d3;
+    const float3 d1 = float3(1.0) + z - s + (s - float3(1.0))/d2;
+    return numerator / d1;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  float2 overload: four-level continued fraction, built bottom-up.
+    const float2 numerator = pow(z, s) * exp(-z);
+    const float2 d4 = float2(7.0) + z - s;
+    const float2 d3 = float2(5.0) + z - s + (3.0*s - float2(9.0))/d4;
+    const float2 d2 = float2(3.0) + z - s + (2.0*s - float2(4.0))/d3;
+    const float2 d1 = float2(1.0) + z - s + (s - float2(1.0))/d2;
+    return numerator / d1;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: four-level continued fraction, built bottom-up.
+    const float numerator = pow(z, s) * exp(-z);
+    const float d4 = 7.0 + z - s;
+    const float d3 = 5.0 + z - s + (3.0*s - 9.0)/d4;
+    const float d2 = 3.0 + z - s + (2.0*s - 4.0)/d3;
+    const float d1 = 1.0 + z - s + (s - 1.0)/d2;
+    return numerator / d1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    The approximate normalized lower incomplete gamma function
+    //              for s < 0.5.  Restricting s this way leaves only two
+    //              branches (not four) to choose from based on z.  Each uses
+    //              four terms, with a max relative error of ~0.00182.  The
+    //              threshold and specifics were adapted for fewer terms from
+    //              Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both branches are always evaluated, since real branches test slower
+    //  even when available.  Per-component 0.0/1.0 masks then keep the result
+    //  of whichever branch applies to each z:
+    static const float4 thresh = float4(0.775075);
+    const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float4 large_z =
+        float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    bool4 pick_large = bool4(z.x > thresh.x, z.y > thresh.y,
+        z.z > thresh.z, z.w > thresh.w);
+    bool4 pick_small = not(pick_large);
+    return large_z * float4(pick_large) +
+        small_z * float4(pick_small);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  float3 overload: evaluate both branches, then mask per component.
+    static const float3 thresh = float3(0.775075);
+    const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float3 large_z =
+        float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    bool3 pick_large = bool3(z.x > thresh.x, z.y > thresh.y,
+        z.z > thresh.z);
+    bool3 pick_small = not(pick_large);
+    return large_z * float3(pick_large) +
+        small_z * float3(pick_small);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  float2 overload: evaluate both branches, then mask per component.
+    static const float2 thresh = float2(0.775075);
+    const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float2 large_z =
+        float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    bool2 pick_large = bool2(z.x > thresh.x, z.y > thresh.y);
+    bool2 pick_small = not(pick_large);
+    return large_z * float2(pick_large) +
+        small_z * float2(pick_small);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload: evaluate both branches, then mask by z > thresh.
+    static const float thresh = 0.775075;
+    const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const bool pick_large = z > thresh;
+    return large_z * float(pick_large) + small_z * float(!pick_large);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    The approximate normalized lower incomplete gamma function
+    //              for s < 0.5; normalized_ligamma_impl() has the details.
+    const float4 recip_s = float4(1.0)/s;
+    const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  float3 overload: precompute 1/s and 1/gamma(s), then delegate.
+    const float3 recip_s = float3(1.0)/s;
+    const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  float2 overload: precompute 1/s and 1/gamma(s), then delegate.
+    const float2 recip_s = float2(1.0)/s;
+    const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload: precompute 1/s and 1/gamma(s), then delegate.
+    const float recip_s = 1.0/s;
+    const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 uv2_to_uv4(float2 tex_uv)
+{
+    //  Zero-pad a float2 uv offset so it can be added to float4 tex2Dlod coords:
+    return float4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Squared-length helper macro (usable with static constants); expands vec twice:
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+inline float get_fast_gaussian_weight_sum_inv(const float sigma)
+{
+    //  Background: the Gaussian integral gives the asymptotic weight of the
+    //  center pixel.  The unnormalized center weight is 1.0, so its normalized
+    //  weight equals the inverse of the weight sum.  For a large enough blur
+    //  (9+), that asymptotic value is close and faster to compute:
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //  Since erf(-x) == -erf(x), that is just erf((0.5/sqrt(2.0))/sigma).
+    //  The curve fit below is faster still, and closer than the asymptotic
+    //  result: it was built from 64 blur sizes in [3, 131) and 255
+    //  equally-spaced sigmas in (0, blurN_std_dev), so results for smaller
+    //  sigmas are biased toward smaller blurs.  The max error is 0.0031793913.
+    //  Relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    //  The result is the smaller of two curves in sigma:
+    const float exp_curve =
+        exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    const float reciprocal_curve = 0.399334576340352/sigma;
+    return min(exp_curve, reciprocal_curve);
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D Gaussian-blurred texture lookup from an 11-tap blur
+    //              (taps spaced dxdy apart).  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-5:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float wt0 = 1.0;
+    const float wt1 = exp(-1.0 * inv_two_var);
+    const float wt2 = exp(-4.0 * inv_two_var);
+    const float wt3 = exp(-9.0 * inv_two_var);
+    const float wt4 = exp(-16.0 * inv_two_var);
+    const float wt5 = exp(-25.0 * inv_two_var);
+    const float norm = 1.0 /
+        (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4 + wt5));
+    //  Accumulate the weighted taps in order across the kernel, then apply
+    //  the normalization factor once at the end:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += wt5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    acc += wt4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    acc += wt3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    acc += wt2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += wt1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += wt1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    acc += wt2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    acc += wt3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    acc += wt4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    acc += wt5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   The file-level global requirements (see file description).
+    //  Returns:    One 1D pass of a 9-tap Gaussian blur: the normalized sum
+    //              of tex2D_linearize() reads at tex_uv + k*dxdy, k = -4..4.
+    //  Unnormalized Gaussian weights for tap distances 0-4:
+    const float exp_scale = 0.5 / (sigma * sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * exp_scale);
+    const float g2 = exp(-4.0 * exp_scale);
+    const float g3 = exp(-9.0 * exp_scale);
+    const float g4 = exp(-16.0 * exp_scale);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Accumulate the taps from k = -4 up to k = +4, then normalize:
+    float3 acc = float3(0.0, 0.0, 0.0);
+    acc += g4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    acc += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    acc += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    acc += g4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   The file-level global requirements (see file description).
+    //  Returns:    One 1D pass of a 7-tap Gaussian blur: the normalized sum
+    //              of tex2D_linearize() reads at tex_uv + k*dxdy, k = -3..3.
+    //  Unnormalized Gaussian weights for tap distances 0-3:
+    const float exp_scale = 0.5 / (sigma * sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * exp_scale);
+    const float g2 = exp(-4.0 * exp_scale);
+    const float g3 = exp(-9.0 * exp_scale);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Accumulate the taps from k = -3 up to k = +3, then normalize:
+    float3 acc = float3(0.0, 0.0, 0.0);
+    acc += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    acc += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   The file-level global requirements (see file description).
+    //  Returns:    One 1D pass of a 5-tap Gaussian blur: the normalized sum
+    //              of tex2D_linearize() reads at tex_uv + k*dxdy, k = -2..2.
+    //  Unnormalized Gaussian weights for tap distances 0-2:
+    const float exp_scale = 0.5 / (sigma * sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * exp_scale);
+    const float g2 = exp(-4.0 * exp_scale);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Accumulate the taps from k = -2 up to k = +2, then normalize:
+    float3 acc = float3(0.0, 0.0, 0.0);
+    acc += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   The file-level global requirements (see file description).
+    //  Returns:    One 1D pass of a 3-tap Gaussian blur: the normalized sum
+    //              of tex2D_linearize() reads at tex_uv + k*dxdy, k = -1..1.
+    //  Unnormalized Gaussian weights for tap distances 0-1:
+    const float exp_scale = 0.5 / (sigma * sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * exp_scale);
+    const float norm = 1.0 / (g0 + 2.0 * g1);
+    //  Accumulate the taps from k = -1 up to k = +1, then normalize:
+    float3 acc = float3(0.0, 0.0, 0.0);
+    acc += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    One 1D pass of an 11-texel Gaussian blur read with only 6
+    //              linearly filtered taps (each tap covers a pair of texels).
+    //  Unnormalized Gaussian weights for texel distances 0-5:
+    const float exp_scale = 0.5 / (sigma * sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * exp_scale);
+    const float g2 = exp(-4.0 * exp_scale);
+    const float g3 = exp(-9.0 * exp_scale);
+    const float g4 = exp(-16.0 * exp_scale);
+    const float g5 = exp(-25.0 * exp_scale);
+    const float side_total = g1 + g2 + g3 + g4 + g5;
+    const float norm = 1.0 / (g0 + 2.0 * side_total);
+    //  Merge texels into pairs (0,1), (2,3), (4,5).  Both center taps read the
+    //  middle texel, so only half of g0 goes into each (0,1) pair.
+    const float pair01 = g0 * 0.5 + g1;
+    const float pair23 = g2 + g3;
+    const float pair45 = g4 + g5;
+    const float shift01 = g1 / pair01;
+    const float shift23 = g3 / pair23;
+    const float shift45 = g5 / pair45;
+    //  Sample each pair between its two texels, sum, and normalize:
+    float3 acc = float3(0.0, 0.0, 0.0);
+    acc += pair45 * tex2D_linearize(tex, tex_uv - (4.0 + shift45) * dxdy).rgb;
+    acc += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + shift23) * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv - shift01 * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv + shift01 * dxdy).rgb;
+    acc += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + shift23) * dxdy).rgb;
+    acc += pair45 * tex2D_linearize(tex, tex_uv + (4.0 + shift45) * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    One 1D pass of a 9-texel Gaussian blur read with 1 tap on
+    //              the center texel and 4 linearly filtered taps, each of
+    //              which covers a pair of texels.
+    //  Unnormalized Gaussian weights for texel distances 0-4:
+    const float exp_scale = 0.5 / (sigma * sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * exp_scale);
+    const float g2 = exp(-4.0 * exp_scale);
+    const float g3 = exp(-9.0 * exp_scale);
+    const float g4 = exp(-16.0 * exp_scale);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Merge the off-center texels into pairs (1,2) and (3,4):
+    const float pair12 = g1 + g2;
+    const float pair34 = g3 + g4;
+    const float shift12 = g2 / pair12;
+    const float shift34 = g4 / pair34;
+    //  Sample each pair between its two texels, sum, and normalize:
+    float3 acc = float3(0.0, 0.0, 0.0);
+    acc += pair34 * tex2D_linearize(tex, tex_uv - (3.0 + shift34) * dxdy).rgb;
+    acc += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + shift12) * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + shift12) * dxdy).rgb;
+    acc += pair34 * tex2D_linearize(tex, tex_uv + (3.0 + shift34) * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    One 1D pass of a 7-texel Gaussian blur read with only 4
+    //              linearly filtered taps (each tap covers a pair of texels).
+    //  Unnormalized Gaussian weights for texel distances 0-3:
+    const float exp_scale = 0.5 / (sigma * sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * exp_scale);
+    const float g2 = exp(-4.0 * exp_scale);
+    const float g3 = exp(-9.0 * exp_scale);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Merge texels into pairs (0,1) and (2,3).  Both center taps read the
+    //  middle texel, so only half of g0 goes into each (0,1) pair.
+    const float pair01 = g0 * 0.5 + g1;
+    const float pair23 = g2 + g3;
+    const float shift01 = g1 / pair01;
+    const float shift23 = g3 / pair23;
+    //  Sample each pair between its two texels, sum, and normalize:
+    float3 acc = float3(0.0, 0.0, 0.0);
+    acc += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + shift23) * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv - shift01 * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv + shift01 * dxdy).rgb;
+    acc += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + shift23) * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    One 1D pass of a 5-texel Gaussian blur read with 1 tap on
+    //              the center texel and 2 linearly filtered taps, each of
+    //              which covers a pair of texels.
+    //  Unnormalized Gaussian weights for texel distances 0-2:
+    const float exp_scale = 0.5 / (sigma * sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * exp_scale);
+    const float g2 = exp(-4.0 * exp_scale);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Merge the off-center texels into the single pair (1,2):
+    const float pair12 = g1 + g2;
+    const float shift12 = g2 / pair12;
+    //  Sample the pair on each side plus the center, sum, and normalize:
+    float3 acc = float3(0.0, 0.0, 0.0);
+    acc += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + shift12) * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + shift12) * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  First get the unnormalized Gaussian texel weights for distances 0-1.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    //  The total weight w0 + 2.0 * w1 equals 2.0 * w01, so each tap's
+    //  normalized weight is exactly 0.5 and no explicit sum is required.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w01 = w0 * 0.5 + w1;
+    const float w01_ratio = w1/w01;
+    //  Weights for all samples are the same, so just average them:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+
+////////////////////////////  HUGE SEPARABLE BLURS  ////////////////////////////
+
+//  Huge separable blurs come only in "fast" versions.
+float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  First get the unnormalized Gaussian texel weights for distances 0-21.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    const float w16 = exp(-256.0 * denom_inv);
+    const float w17 = exp(-289.0 * denom_inv);
+    const float w18 = exp(-324.0 * denom_inv);
+    const float w19 = exp(-361.0 * denom_inv);
+    const float w20 = exp(-400.0 * denom_inv);
+    const float w21 = exp(-441.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 +
+    //        w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  used instead of the exact sum above
+    //  Pair up texels: wA_B is a pair's combined weight and wA_B_ratio is the
+    //  linear-tap offset from A toward B.  Both center taps share w0, so halve it.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w16_17 = w16 + w17;
+    const float w18_19 = w18 + w19;
+    const float w20_21 = w20 + w21;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    const float w16_17_ratio = w17/w16_17;
+    const float w18_19_ratio = w19/w18_19;
+    const float w20_21_ratio = w21/w20_21;
+    //  Sum the weighted taps from the -dxdy side to the +dxdy side, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 31x Gaussian blurred texture lookup using 16 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  First get the unnormalized Gaussian texel weights for distances 0-15.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 +
+    //        w9 + w10 + w11 + w12 + w13 + w14 + w15));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  used instead of the exact sum above
+    //  Pair up texels: wA_B is a pair's combined weight and wA_B_ratio is the
+    //  linear-tap offset from A toward B.  Both center taps share w0, so halve it.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    //  Sum the weighted taps from the -dxdy side to the +dxdy side, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 25x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 12 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  First get the unnormalized Gaussian texel weights for distances 0-12.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  used instead of the exact sum above
+    //  Pair up off-center texels: wA_B is a pair's weight, wA_B_ratio its tap offset toward B.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w9_10 = w9 + w10;
+    const float w11_12 = w11 + w12;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    const float w9_10_ratio = w10/w9_10;
+    const float w11_12_ratio = w12/w11_12;
+    //  Sum the weighted taps from the -dxdy side to the +dxdy side, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 17x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 8 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  First get the unnormalized Gaussian texel weights for distances 0-8.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  used instead of the exact sum above
+    //  Pair up off-center texels: wA_B is a pair's weight, wA_B_ratio its tap offset toward B.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    //  Sum the weighted taps from the -dxdy side to the +dxdy side, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   The file-level global requirements (see file description).
+    //  Returns:    A one-pass 3x3 Gaussian blur: the normalized weighted sum
+    //              of nine tex2D_linearize() reads on a grid centered at
+    //              tex_uv, spaced dxdy.x along x and dxdy.y along y.
+    //  Note:       Per the original author's notes, this is the only
+    //              arbitrarily resizable one-pass blur (a 5x5 would be MUCH slower).
+    const float exp_scale = 0.5 / (sigma * sigma);
+    //  Read all nine texels, one row at a time.  Tap names give the sign of
+    //  the x then y offset (n = negative, 0 = none, p = positive).  The
+    //  original notes say sharing samples across a pixel quad would not help.
+    const float2 row_0_uv = tex_uv;
+    const float2 step_x = float2(dxdy.x, 0.0);
+    const float2 step_y = float2(0.0, dxdy.y);
+    const float2 row_n_uv = row_0_uv - step_y;
+    const float2 row_p_uv = row_0_uv + step_y;
+    const float3 tap_nn = tex2D_linearize(tex, row_n_uv - step_x).rgb;
+    const float3 tap_0n = tex2D_linearize(tex, row_n_uv).rgb;
+    const float3 tap_pn = tex2D_linearize(tex, row_n_uv + step_x).rgb;
+    const float3 tap_n0 = tex2D_linearize(tex, row_0_uv - step_x).rgb;
+    const float3 tap_00 = tex2D_linearize(tex, row_0_uv).rgb;
+    const float3 tap_p0 = tex2D_linearize(tex, row_0_uv + step_x).rgb;
+    const float3 tap_np = tex2D_linearize(tex, row_p_uv - step_x).rgb;
+    const float3 tap_0p = tex2D_linearize(tex, row_p_uv).rgb;
+    const float3 tap_pp = tex2D_linearize(tex, row_p_uv + step_x).rgb;
+    //  Gaussian weights (LENGTH_SQ is presumably the squared vector length):
+    const float g_center = 1.0;
+    const float g_edge = exp(-LENGTH_SQ(float2(1.0, 0.0)) * exp_scale);
+    const float g_corner = exp(-LENGTH_SQ(float2(1.0, 1.0)) * exp_scale);
+    const float norm = 1.0 / (g_center + 4.0 * (g_edge + g_corner));
+    //  Weight the center, the four edge texels, and the four corner texels:
+    const float3 acc = g_center * tap_00 +
+        g_edge * (tap_0n + tap_n0 + tap_p0 + tap_0p) +
+        g_corner * (tap_nn + tap_pn + tap_np + tap_pp);
+    return acc * norm;
+}
+
+
+////////////////////////////  FASTER ONE-PASS BLURS  ///////////////////////////
+
+float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.  Adjust the
+    //  bilinear sample location to reflect the true Gaussian weights for each
+    //  underlying texel.  The following diagram illustrates the relative
+    //  locations of bilinear samples.  Each sample with the same number has the
+    //  same weight (notice the symmetry).  The letters a, b, c, d distinguish
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float w4off = exp(-16.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    const float texel3to4ratio = w4off/(w3off + w4off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0);
+    const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+    const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio);
+    const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio);
+    const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2R1 = w3off;
+    const float w2R2 = w4off;
+    const float w3d1 =     exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w3d4 =     exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv);
+    const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv);
+    const float w6d1 =     exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv);
+    const float w6d4 =     exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2R1 + w2R2;
+    const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    const float w5 = w4;
+    const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear:
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  One-pass 7x7 Gaussian blur assembled from a 4x4 grid of bilinear taps.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    The rgb of a 7x7 Gaussian-blurred lookup around tex_uv,
+    //              formed from 4x4 weighted bilinear taps (weights total 1).
+    //  Description:
+    //  Read the notes on tex2Dblur9x9() and tex2Dblur7() first; this blur
+    //  borrows from both.  Tap layout:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  Texel layout.  Taps 3a/3b, 1a/1b, 1c/1d, and 3c/3d overlap on the
+    //  center column of texels; taps 2a/2c, 1a/1c, 1b/1d, and 2b/2d overlap
+    //  on the center row; all four "1" taps include the center texel.  A
+    //  shared texel is labeled with every tap that reads it (e.g. 3ab1):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  TAP PLACEMENT:
+    //  1D Gaussian weights 0-3 texels out set the bilinear ratio inside texel
+    //  pairs [0, 1] and [2, 3]; the shared center texel counts at half weight.
+    const float gauss_scale = 0.5/(sigma*sigma);
+    const float weight_at_0 = 1.0;
+    const float weight_at_1 = exp(-1.0 * gauss_scale);
+    const float weight_at_2 = exp(-4.0 * gauss_scale);
+    const float weight_at_3 = exp(-9.0 * gauss_scale);
+    const float inner_ratio = weight_at_1/(weight_at_0 * 0.5 + weight_at_1);
+    const float outer_ratio = weight_at_3/(weight_at_2 + weight_at_3);
+    //  Texel offsets of the four taps in the bottom-right quadrant:
+    const float2 offset_1d = float2(inner_ratio, inner_ratio);
+    const float2 offset_2d = float2(2.0, 0.0) + float2(outer_ratio, inner_ratio);
+    const float2 offset_3d = float2(0.0, 2.0) + float2(inner_ratio, outer_ratio);
+    const float2 offset_4d = float2(2.0, 2.0) + float2(outer_ratio, outer_ratio);
+
+    //  TAP WEIGHTS:
+    //  Gaussian weights of the bottom-right-quadrant texels, named by their
+    //  (x, y) texel distance from the center; a mirrored pair shares a value.
+    const float texel_0_0 = 1.0;
+    const float texel_1_0 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * gauss_scale);
+    const float texel_2_0 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * gauss_scale);
+    const float texel_3_0 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * gauss_scale);
+    const float texel_1_1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * gauss_scale);
+    const float texel_2_1 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * gauss_scale);
+    const float texel_3_1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * gauss_scale);
+    const float texel_2_2 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * gauss_scale);
+    const float texel_3_2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * gauss_scale);
+    const float texel_3_3 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * gauss_scale);
+    //  A tap's weight sums its texels; a texel read by several taps has its
+    //  weight split evenly among them.  Taps 2 and 3 are mirror images:
+    const float tap1_weight = texel_0_0 * 0.25 + texel_1_0 + texel_1_1;
+    const float tap2_3_weight = (texel_2_0 + texel_3_0) * 0.5 + texel_2_1 + texel_3_1;
+    const float tap4_weight = texel_2_2 + 2.0 * texel_3_2 + texel_3_3;
+    //  Four quadrants, each contributing taps 1, 2, 3, and 4:
+    const float normalizer =
+        1.0/(4.0 * (tap1_weight + 2.0 * tap2_3_weight + tap4_weight));
+
+    //  FETCH TAPS:
+    //  16 bilinear lookups.  The bottom-right offsets above are reflected
+    //  into the other quadrants by flipping the sign of the per-texel uv
+    //  step on x, y, or both:
+    const float2 step_flip_x = dxdy * float2(-1.0, 1.0);
+    const float2 step_flip_y = dxdy * float2(1.0, -1.0);
+    const float2 step_flip_xy = dxdy * float2(-1.0, -1.0);
+    //  Quadrants a (both flipped), b (y flipped), c (x flipped), d (as is):
+    const float3 tap_1a = tex2D_linearize(tex, tex_uv + step_flip_xy * offset_1d).rgb;
+    const float3 tap_2a = tex2D_linearize(tex, tex_uv + step_flip_xy * offset_2d).rgb;
+    const float3 tap_3a = tex2D_linearize(tex, tex_uv + step_flip_xy * offset_3d).rgb;
+    const float3 tap_4a = tex2D_linearize(tex, tex_uv + step_flip_xy * offset_4d).rgb;
+    const float3 tap_1b = tex2D_linearize(tex, tex_uv + step_flip_y * offset_1d).rgb;
+    const float3 tap_2b = tex2D_linearize(tex, tex_uv + step_flip_y * offset_2d).rgb;
+    const float3 tap_3b = tex2D_linearize(tex, tex_uv + step_flip_y * offset_3d).rgb;
+    const float3 tap_4b = tex2D_linearize(tex, tex_uv + step_flip_y * offset_4d).rgb;
+    const float3 tap_1c = tex2D_linearize(tex, tex_uv + step_flip_x * offset_1d).rgb;
+    const float3 tap_2c = tex2D_linearize(tex, tex_uv + step_flip_x * offset_2d).rgb;
+    const float3 tap_3c = tex2D_linearize(tex, tex_uv + step_flip_x * offset_3d).rgb;
+    const float3 tap_4c = tex2D_linearize(tex, tex_uv + step_flip_x * offset_4d).rgb;
+    const float3 tap_1d = tex2D_linearize(tex, tex_uv + dxdy * offset_1d).rgb;
+    const float3 tap_2d = tex2D_linearize(tex, tex_uv + dxdy * offset_2d).rgb;
+    const float3 tap_3d = tex2D_linearize(tex, tex_uv + dxdy * offset_3d).rgb;
+    const float3 tap_4d = tex2D_linearize(tex, tex_uv + dxdy * offset_4d).rgb;
+
+    //  COMBINE:
+    //  Accumulate the weighted taps, then normalize so the weights total 1.0:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += tap1_weight * (tap_1a + tap_1b + tap_1c + tap_1d);
+    blurred += tap2_3_weight * (tap_2a + tap_2b + tap_2c + tap_2d);
+    blurred += tap2_3_weight * (tap_3a + tap_3b + tap_3c + tap_3d);
+    blurred += tap4_weight * (tap_4a + tap_4b + tap_4c + tap_4d);
+    return blurred * normalizer;
+}
+
+float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  One-pass 5x5 Gaussian blur assembled from a 3x3 grid of taps.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    The rgb of a 5x5 Gaussian-blurred lookup around tex_uv,
+    //              formed from 3x3 weighted taps (weights total 1).
+    //  Description:
+    //  Read the notes on tex2Dblur9x9() first: the idea and the tap/texel
+    //  placement are the same, only on a smaller scale.  Tap layout:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texel layout:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  TAP PLACEMENT:
+    //  1D Gaussian weights 1 and 2 texels out set the bilinear ratio used
+    //  inside each texel pair (see tex2Dblur9x9 for the details).
+    const float gauss_scale = 0.5/(sigma*sigma);
+    const float weight_at_1 = exp(-1.0 * gauss_scale);
+    const float weight_at_2 = exp(-4.0 * gauss_scale);
+    const float pair_ratio = weight_at_2/(weight_at_1 + weight_at_2);
+    //  Texel offsets of the right tap (1R) and the bottom-right tap (2d):
+    const float2 offset_1R = float2(1.0, 0.0) + float2(pair_ratio, 0.0);
+    const float2 offset_2d = float2(1.0, 1.0) + float2(pair_ratio, pair_ratio);
+
+    //  TAP WEIGHTS:
+    //  A tap's weight is the sum of the Gaussian weights of the texels it
+    //  covers.  An axis tap covers the texels 1 and 2 away from the center;
+    //  the diagonal tap 2d covers (1,1), (2,1), (1,2), and (2,2), where the
+    //  two mixed texels are equidistant and therefore share one weight.
+    const float texel_1_1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * gauss_scale);
+    const float texel_2_1 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * gauss_scale);
+    const float texel_2_2 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * gauss_scale);
+    const float center_tap_weight = 1.0;
+    const float axis_tap_weight = weight_at_1 + weight_at_2;
+    const float diag_tap_weight = texel_1_1 + 2.0 * texel_2_1 + texel_2_2;
+    //  One center tap plus four of each other kind; invert once to normalize:
+    const float normalizer =
+        1.0/(center_tap_weight + 4.0 * (axis_tap_weight + diag_tap_weight));
+
+    //  FETCH TAPS:
+    //  9 lookups (1 nearest, 4 linear, 4 bilinear).  The axis taps reuse
+    //  offset_1R, swizzled for the vertical pair and subtracted for the
+    //  left/up pair.  The diagonal taps reuse offset_2d with the sign of the
+    //  per-texel uv step flipped on x, y, or both:
+    const float2 step_flip_x = dxdy * float2(-1.0, 1.0);
+    const float2 step_flip_y = dxdy * float2(1.0, -1.0);
+    const float2 step_flip_xy = dxdy * float2(-1.0, -1.0);
+    const float3 tap_0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 tap_1R = tex2D_linearize(tex, tex_uv + dxdy * offset_1R).rgb;
+    const float3 tap_1D = tex2D_linearize(tex, tex_uv + dxdy * offset_1R.yx).rgb;
+    const float3 tap_1L = tex2D_linearize(tex, tex_uv - dxdy * offset_1R).rgb;
+    const float3 tap_1U = tex2D_linearize(tex, tex_uv - dxdy * offset_1R.yx).rgb;
+    const float3 tap_2a = tex2D_linearize(tex, tex_uv + step_flip_xy * offset_2d).rgb;
+    const float3 tap_2b = tex2D_linearize(tex, tex_uv + step_flip_y * offset_2d).rgb;
+    const float3 tap_2c = tex2D_linearize(tex, tex_uv + step_flip_x * offset_2d).rgb;
+    const float3 tap_2d = tex2D_linearize(tex, tex_uv + dxdy * offset_2d).rgb;
+
+    //  COMBINE:
+    //  Accumulate the weighted taps, then normalize so the weights total 1.0:
+    float3 blurred = center_tap_weight * tap_0C;
+    blurred += axis_tap_weight * (tap_1R + tap_1D + tap_1L + tap_1U);
+    blurred += diag_tap_weight * (tap_2a + tap_2b + tap_2c + tap_2d);
+    return blurred * normalizer;
+}
+
+float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  One-pass 3x3 Gaussian blur assembled from 2x2 bilinear taps.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    The rgb of a 3x3 Gaussian-blurred lookup around tex_uv,
+    //              formed by averaging four bilinear taps (one per quadrant).
+    //  Description:
+    //  Read the notes on tex2Dblur9x9() and tex2Dblur7() first; this blur
+    //  borrows from both.  Tap layout:
+    //      0a 0b
+    //      0c 0d
+    //  Texel layout.  Taps 0a/0b and 0c/0d overlap on the center column of
+    //  texels, taps 0a/0c and 0b/0d overlap on the center row, and all four
+    //  taps include the center texel:
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  TAP PLACEMENT:
+    //  Weigh the center texel (at half strength, as the taps on both sides of
+    //  it read it) against its neighbor to get the bilinear sampling ratio:
+    const float gauss_scale = 0.5/(sigma*sigma);
+    const float weight_at_0 = 1.0;
+    const float weight_at_1 = exp(-1.0 * gauss_scale);
+    const float tap_ratio = weight_at_1/(weight_at_0 * 0.5 + weight_at_1);
+    //  Texel offset of the bottom-right tap (0d), same ratio on both axes:
+    const float2 offset_0d = float2(tap_ratio, tap_ratio);
+
+    //  FETCH TAPS:
+    //  4 bilinear lookups.  The bottom-right offset is reflected into the
+    //  other three quadrants by flipping the sign of the per-texel uv step
+    //  on x, y, or both.  Quadrants are a (both flipped), b (y flipped),
+    //  c (x flipped), and d (as is):
+    const float2 step_flip_x = dxdy * float2(-1.0, 1.0);
+    const float2 step_flip_y = dxdy * float2(1.0, -1.0);
+    const float2 step_flip_xy = dxdy * float2(-1.0, -1.0);
+    const float3 tap_0a = tex2D_linearize(tex, tex_uv + step_flip_xy * offset_0d).rgb;
+    const float3 tap_0b = tex2D_linearize(tex, tex_uv + step_flip_y * offset_0d).rgb;
+    const float3 tap_0c = tex2D_linearize(tex, tex_uv + step_flip_x * offset_0d).rgb;
+    const float3 tap_0d = tex2D_linearize(tex, tex_uv + dxdy * offset_0d).rgb;
+
+    //  COMBINE:
+    //  Every tap carries the same weight, so the result is a plain average:
+    return 0.25 * (tap_0a + tap_0b + tap_0c + tap_0d);
+}
+
+
+//////////////////  LINEAR ONE-PASS BLURS WITH SHARED SAMPLES  /////////////////
+
+float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   1.) Same as tex2Dblur9()
+    //              2.) ddx() and ddy() are present in the current Cg profile.
+    //              3.) The GPU driver is using fine/high-quality derivatives.
+    //              4.) quad_vector *correctly* describes the current fragment's
+    //                  location in its pixel quad, by the conventions noted in
+    //                  get_quad_vector[_naive].
+    //              5.) tex_uv.w = log2(video_size/output_size).y
+    //              6.) tex2Dlod() is present in the current Cg profile.
+    //  Optional:   Tune artifacts vs. excessive blurriness with the global
+    //              float error_blurring.
+    //  Returns:    A blurred texture lookup using a "virtual" 12x12 Gaussian
+    //              blur (a 6x6 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  Perform a 1-pass blur with shared texture lookups across a pixel quad.
+    //  We'll get neighboring samples with high-quality ddx/ddy derivatives, as
+    //  in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
+    //  Message Passing" by Eric Penner.
+    //
+    //  Our "virtual" 12x12 blur will be comprised of (6^2)/4 + 3 = 12
+    //  bilinear samples, where bilinear sampling positions are computed from
+    //  the relative Gaussian weights of the 4 surrounding texels.  The catch is
+    //  that the appropriate texel weights and sample coords differ for each
+    //  fragment, but we're reusing most of the same samples across a quad of
+    //  destination fragments.  (We do use unique coords for the four nearest
+    //  samples at each fragment.)  Mixing bilinear filtering and sample-sharing
+    //  therefore introduces some error into the weights, and this can get nasty
+    //  when the source image is small or high-frequency.  Computing bilinear
+    //  ratios based on weights at the sample field center results in sharpening
+    //  and ringing artifacts, but we can move samples closer to halfway between
+    //  texels to try blurring away the error (which can move features around by
+    //  a texel or so).  Tune this with the global float "error_blurring".
+    //
+    //  The pixel quad's sample field covers 12x12 texels, accessed through 6x6
+    //  bilinear (2x2 texel) taps.  Each fragment depends on a window of 10x10
+    //  texels (5x5 bilinear taps) [NOTE(review): the sum below weights all 6x6
+    //  taps -- confirm], and each fragment loads a 6x6 texel quadrant as a 3x3
+    //  block of bilinear taps, plus 3 more taps to use unique bilinear coords
+    //  for sample0* for each fragment.  This diagram shows the relative spots
+    //  of bilinear samples 0-8 for each quadrant a, b, c, d (unequally spaced):
+    //      8a 7a 6a 6b 7b 8b
+    //      5a 4a 3a 3b 4b 5b
+    //      2a 1a 0a 0b 1b 2b
+    //      2c 1c 0c 0d 1d 2d
+    //      5c 4c 3c 3d 4d 5d
+    //      8c 7c 6c 6d 7d 8d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2 texel block:
+    //      8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
+    //      8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
+    //      5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
+    //      5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
+    //      2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
+    //      2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
+    //      2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
+    //      2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
+    //      5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
+    //      5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
+    //      8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
+    //      8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
+    //  With this symmetric arrangement, we don't have to know which absolute
+    //  quadrant a sample lies in to assign kernel weights; it's enough to know
+    //  the sample number and the relative quadrant of the sample (relative to
+    //  the current quadrant):
+    //      {current, adjacent x, adjacent y, diagonal}
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
+    //  and [4, 5] away from the fragment, and reuse them independently for both
+    //  dimensions.  Use the sample field center as the estimated destination,
+    //  but nudge the result closer to halfway between texels to blur error.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  based on the sum of their 4 underlying texel weights.  Assume a same-
+    //  resolution blur, so each symmetrically named sample weight will compute
+    //  the same at every fragment in the pixel quad: We can therefore compute
+    //  texel weights based only on the bottom-right quadrant (fragment at 0d0).
+    //  To avoid too much boilerplate code, use a macro to get all 4 texel
+    //  weights for a bilinear sample based on the offset of its top-left texel:
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
+    const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
+    const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
+    const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
+    const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
+    const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
+    const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
+    const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
+    const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
+    const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    float3 sample8adjx, sample8adjy, sample8diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
+    sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
+    sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
+    sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared(); sigma = Gaussian std. dev. in texels
+    //  Returns:    A blurred texture lookup using a "virtual" 10x10 Gaussian
+    //              blur (a 5x5 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 25 of the 36 samples taken across the pixel quad (to cover a
+    //  5x5 sample area, or 10x10 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 11 omitted samples
+    //  are always the same:
+    //      2adjx, 5adjx, 8adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2): Gaussian exponent scale
+    const float w0off   = 1.0;  //  exp(0): weight at zero offset
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 25 of the 36 sample weights.  Skip the following weights:
+    //      2adjx, 5adjx, 8adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor) over the 25 used weights:
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w4curr + w5curr + w6curr + w7curr + w8curr +
+        w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
+        w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
+        w0diag + w1diag + w3diag + w4diag);
+    //  Statically pack most weights for runtime (w8curr stays scalar).  Note the mixed packing:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);  //  adjx/diag of samples 2 and 5 are omitted
+    const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);  //  adjy/diag of samples 6 and 7 are omitted
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad in order of need:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);  //  sample8: only 8curr is used, so no gather
+    //  Sum the weighted samples; weights are normalized at the end (so total =
+    //  1.0).  Fill each row of a matrix with an rgb sample and pre-multiply by
+    //  the weights to obtain a weighted result.  First do the simple ones:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    //  Now do the mixed-sample ones:
+    sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
+    sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
+    sum += w8curr * sample8curr;
+    //  Normalize the sum (so the weights add to 1.0) and return:
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared(); sigma = Gaussian std. dev. in texels
+    //  Returns:    A blurred texture lookup using a "virtual" 8x8 Gaussian
+    //              blur (a 4x4 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This function
+    //  shares the same concept and a similar sample placement, except each
+    //  quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
+    //  respectively.  There could be a total of 16 samples, 4 of which each
+    //  fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
+    //  its own offset to reduce shared sample artifacts, bringing the sample
+    //  count for each fragment to 7.  Sample placement:
+    //      3a 2a 2b 3b
+    //      1a 0a 0b 1b
+    //      1c 0c 0d 1d
+    //      3c 2c 2d 3d
+    //  Texel placement:
+    //      3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
+    //      3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
+    //      1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
+    //      1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
+    //      1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
+    //      1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
+    //      3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
+    //      3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
+    
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2): Gaussian exponent scale
+    const float w0off   = 1.0;  //  exp(0): weight at zero offset
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    //  Get the weight sum inverse (normalization factor) over all 16 weights:
+    const float4 weight_sum4 = w0 + w1 + w2 + w3;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    //  Sum the weighted samples; weights are normalized at the end (so total =
+    //  1.0).  Fill each row of a matrix with an rgb sample and pre-multiply by
+    //  the weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared(); sigma = Gaussian std. dev. in texels
+    //  Returns:    A blurred texture lookup using a "virtual" 6x6 Gaussian
+    //              blur (a 3x3 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur8x8shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 9 of the 16 samples taken across the pixel quad (to cover a
+    //  3x3 sample area, or 6x6 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 7 omitted samples
+    //  are always the same:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2): Gaussian exponent scale
+    const float w0off   = 1.0;  //  exp(0): weight at zero offset
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 9 of the 16 sample weights.  Skip the following weights:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor) over the 9 used weights:
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
+    //  Statically pack some weights for runtime (only sample0* uses all four):
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);  //  sample3: only 3curr is used, so no gather
+    //  Sum the weighted samples; weights are normalized at the end (so total =
+    //  1.0).  Fill each row of a matrix with an rgb sample and pre-multiply by
+    //  the weights to obtain a weighted result for sample0*, and handle the
+    //  rest of the weights more directly/verbosely:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
+            w2adjx * sample2adjx + w3curr * sample3curr;
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  MAX OPTIMAL SIGMA BLUR WRAPPERS  //////////////////////
+
+//  The following blurs are static wrappers around the dynamic blurs above.
+//  HOPEFULLY, the compiler will be smart enough to do constant-folding.
+
+//  Resizable separable blurs:
+inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur11resize() with blur11_std_dev.
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur9resize() with blur9_std_dev.
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur7resize() with blur7_std_dev.
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur5resize() with blur5_std_dev.
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur3resize() with blur3_std_dev.
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Fast separable blurs:
+inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur11fast() with blur11_std_dev.
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur9fast() with blur9_std_dev.
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur7fast() with blur7_std_dev.
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur5fast() with blur5_std_dev.
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur3fast() with blur3_std_dev.
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Huge, "fast" separable blurs:
+inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur43fast() with blur43_std_dev.
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur31fast() with blur31_std_dev.
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur25fast() with blur25_std_dev.
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur17fast() with blur17_std_dev.
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+//  Resizable one-pass blurs:
+inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur3x3resize() with blur3_std_dev.
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" one-pass blurs:
+inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur9x9() with blur9_std_dev.
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur7x7() with blur7_std_dev.
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur5x5() with blur5_std_dev.
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur3x3() with blur3_std_dev.
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" shared-sample one-pass blurs:
+inline float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur12x12shared() with blur12_std_dev.
+    return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev);
+}
+inline float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur10x10shared() with blur10_std_dev.
+    return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev);
+}
+inline float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur8x8shared() with blur8_std_dev.
+    return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev);
+}
+inline float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{  //  Fixed-sigma overload: forwards to the sigma-taking tex2Dblur6x6shared() with blur6_std_dev.
+    return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev);
+}
+
+
+#endif  //  BLUR_FUNCTIONS_H
+
+////////////////////////////  END BLUR-FUNCTIONS  ///////////////////////////
+
+///////////////////////////////  BLOOM CONSTANTS  //////////////////////////////
+
+//  Compute constants with manual inlines of the functions below:
+static const float bloom_diff_thresh = 1.0/256.0;  //  NOTE(review): presumably the `thresh` fed to the helpers below -- confirm at call sites
+
+
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Requires:   1.) triad_size is the final phosphor triad size in pixels
+    //              2.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    Return the minimum sigma that will fully blur a phosphor
+    //              triad on the screen to an even color, within thresh.
+    //              This closed-form function was found by curve-fitting data.
+    //  Estimate: max error = ~0.086036, mean sq. error = ~0.0013387:
+    return -0.05168 + 0.6113*triad_size -
+        1.122*triad_size*sqrt(0.000416 + thresh);
+    //  Estimate: max error = ~0.16486, mean sq. error = ~0.0041041:
+    //return 0.5985*triad_size - triad_size*sqrt(thresh)
+}
+
+inline float get_absolute_scale_blur_sigma(const float thresh)
+{
+    //  Requires:   1.) min_allowed_viewport_triads must be a global with an .x
+    //                  component.  The number of horizontal phosphor triads in
+    //                  the final image must be >= that for realistic results.
+    //              2.) bloom_approx_scale_x must be a global float equal to the
+    //                  absolute horizontal scale of BLOOM_APPROX.
+    //              3.) bloom_approx_scale_x/min_allowed_viewport_triads.x
+    //                  should be <= 1.1658025090 to keep the final result <
+    //                  0.62666015625 (the largest sigma ensuring the largest
+    //                  unused texel weight stays < 1.0/256.0 for a 3x3 blur).
+    //              4.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    Return the minimum Gaussian sigma that will blur the pass
+    //              output as much as it would have taken to blur away
+    //              bloom_approx_scale_x horizontal phosphor triads.
+    //  Description:
+    //  BLOOM_APPROX should look like a downscaled phosphor blur.  Ideally, we'd
+    //  use the same blur sigma as the actual phosphor bloom and scale it down
+    //  to the current resolution with (bloom_approx_scale_x/viewport_size_x), but
+    //  we don't know the viewport size in this pass.  Instead, we'll blur as
+    //  much as it would take to blur away min_allowed_viewport_triads.x.  This
+    //  will blur "more than necessary" if the user actually uses more triads,
+    //  but that's not terrible either, because blurring a constant fraction of
+    //  the viewport may better resemble a true optical bloom anyway (since the
+    //  viewport will generally be about the same fraction of each player's
+    //  field of view, regardless of screen size and resolution).
+    //  Assume an extremely large viewport size (max_viewport_size_x, which is
+    //  not declared in this function) for asymptotic results.
+    return bloom_approx_scale_x/max_viewport_size_x *
+        get_min_sigma_to_blur_triad(
+            max_viewport_size_x/min_allowed_viewport_triads.x, thresh);
+}
+
+inline float get_center_weight(const float sigma)
+{
+    //  Given a Gaussian blur sigma, get the blur weight for the center texel.
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        return get_fast_gaussian_weight_sum_inv(sigma);
+    #else
+        const float denom_inv = 0.5/(sigma*sigma);
+        const float w0 = 1.0;
+        const float w1 = exp(-1.0 * denom_inv);
+        const float w2 = exp(-4.0 * denom_inv);
+        const float w3 = exp(-9.0 * denom_inv);
+        const float w4 = exp(-16.0 * denom_inv);
+        const float w5 = exp(-25.0 * denom_inv);
+        const float w6 = exp(-36.0 * denom_inv);
+        const float w7 = exp(-49.0 * denom_inv);
+        const float w8 = exp(-64.0 * denom_inv);
+        const float w9 = exp(-81.0 * denom_inv);
+        const float w10 = exp(-100.0 * denom_inv);
+        const float w11 = exp(-121.0 * denom_inv);
+        const float w12 = exp(-144.0 * denom_inv);
+        const float w13 = exp(-169.0 * denom_inv);
+        const float w14 = exp(-196.0 * denom_inv);
+        const float w15 = exp(-225.0 * denom_inv);
+        const float w16 = exp(-256.0 * denom_inv);
+        const float w17 = exp(-289.0 * denom_inv);
+        const float w18 = exp(-324.0 * denom_inv);
+        const float w19 = exp(-361.0 * denom_inv);
+        const float w20 = exp(-400.0 * denom_inv);
+        const float w21 = exp(-441.0 * denom_inv);
+        //  Note: If the implementation uses a smaller blur than the max allowed,
+        //  the worst case scenario is that the center weight will be overestimated,
+        //  so we'll put a bit more energy into the brightpass...no huge deal.
+        //  Then again, if the implementation uses a larger blur than the max
+        //  "allowed" because of dynamic branching, the center weight could be
+        //  underestimated, which is more of a problem...consider always using
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            //  43x blur:
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 +
+                w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            //  31x blur:
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 +
+                w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            //  25x blur:
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            //  17x blur:
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+        #else
+            //  9x blur:
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+        const float center_weight = weight_sum_inv * weight_sum_inv;
+        return center_weight;
+    #endif
+}
+
+inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  If sigma is static, we can safely branch and use the smallest blur
+    //  that's big enough.  Ignore #define hints, because we'll only use a
+    //  large blur if we actually need it, and the branches cost nothing.
+    #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+    #else
+        //  It's still worth branching if the profile supports dynamic branches:
+        //  It's much faster than using a hugely excessive blur, but each branch
+        //  eats ~1% FPS.
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        #endif
+    #endif
+    //  Failed optimization notes:
+    //  I originally created a same-size mipmapped 5-tap separable blur10 that
+    //  could handle any sigma by reaching into lower mip levels.  It was
+    //  as fast as blur25fast for runtime sigmas and a tad faster than
+    //  blur31fast for static sigmas, but mipmapping two viewport-size passes
+    //  ate 10% of FPS across all codepaths, so it wasn't worth it.
+    #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        if(sigma <= blur9_std_dev)
+        {
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur17_std_dev)
+        {
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur25_std_dev)
+        {
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur31_std_dev)
+        {
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        }
+        else
+        {
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        }
+    #else
+        //  If we can't afford to branch, we can only guess at what blur
+        //  size we need.  Therefore, use the largest blur allowed.
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        #else
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    #endif  //  PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+}
+
+inline float get_bloom_approx_sigma(const float output_size_x_runtime,
+    const float estimated_viewport_size_x)
+{
+    //  Requires:   1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
+    //                  This is included for dynamic codepaths just in case the
+    //                  following two globals are incorrect:
+    //              2.) bloom_approx_size_x_for_fake should == the same
+    //                  if PHOSPHOR_BLOOM_FAKE is #defined
+    //              3.) bloom_approx_size_x should == the same otherwise
+    //  Returns:    For gaussian4x4, return a dynamic small bloom sigma that's
+    //              as close to optimal as possible given available information.
+    //              For blur3x3, return a static small bloom sigma that
+    //              works well for typical cases.  Otherwise, we're using simple
+    //              bilinear filtering, so use static calculations.
+    //  Assume the default static value.  This is a compromise that ensures
+    //  typical triads are blurred, even if unusually large ones aren't.
+    static const float mask_num_triads_static =
+        max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
+    const float mask_num_triads_from_size =
+        estimated_viewport_size_x/mask_triad_size_desired;
+    const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x,
+        lerp(mask_num_triads_from_size, mask_num_triads_desired,
+            mask_specify_num_triads));
+    //  Assume an extremely large viewport size for asymptotic results:
+    static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
+    {
+        //  Use the runtime num triads and output size:
+        const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_runtime;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_runtime/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  account for the Gaussian scanline sigma from the last pass too.
+        //  The bloom will be too wide horizontally but tall enough vertically.
+        return length(float2(bloom_approx_sigma, beam_max_sigma));
+    }
+    else    //  3x3 blur resize (the bilinear resize doesn't need a sigma)
+    {
+        //  We're either using blur3x3 or bilinear filtering.  The biggest
+        //  reason to choose blur3x3 is to avoid dynamic weights, so use a
+        //  static calculation.
+        #ifdef PHOSPHOR_BLOOM_FAKE
+            static const float output_size_x_static =
+                bloom_approx_size_x_for_fake;
+        #else
+            static const float output_size_x_static = bloom_approx_size_x;
+        #endif
+        static const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_static;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_static/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  try accounting for the Gaussian scanline sigma from the last pass
+        //  too; use the static default value:
+        return length(float2(bloom_approx_sigma, beam_max_sigma_static));
+    }
+}
+
+inline float get_final_bloom_sigma(const float bloom_sigma_runtime)
+{
+    //  Requires:   1.) bloom_sigma_runtime is a precalculated sigma that's
+    //                  optimal for the [known] triad size.
+    //              2.) Call this from a fragment shader (not a vertex shader),
+    //                  or blurring with static sigmas won't be constant-folded.
+    //  Returns:    Return the optimistic static sigma if the triad size is
+    //              known at compile time.  Otherwise return the optimal runtime
+    //              sigma (10% slower) or an implementation-specific compromise
+    //              between an optimistic or pessimistic static sigma.
+    //  Notes:      Call this from the fragment shader, NOT the vertex shader,
+    //              so static sigmas can be constant-folded!
+    const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad(
+        mask_triad_size_desired_static, bloom_diff_thresh);
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        return bloom_sigma_runtime;
+    #else
+        //  Overblurring looks as bad as underblurring, so assume average-size
+        //  triads, not worst-case huge triads:
+        return bloom_sigma_optimistic;
+    #endif
+}
+
+
+#endif  //  BLOOM_FUNCTIONS_H
+
+////////////////////////////  END BLOOM-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+float3 tex2Dresize_gaussian4x4(sampler2D tex, float2 tex_uv, float2 dxdy, float2 tex_size, float2 texture_size_inv, float2 tex_uv_to_pixel_scale, float sigma)
+{
+    //  Requires:   1.) All requirements of gamma-management.h must be satisfied!
+    //              2.) filter_linearN must == "true" in your .cgp preset.
+    //              3.) mipmap_inputN must == "true" in your .cgp preset if
+    //                  output_size << SRC.video_size.
+    //              4.) dxdy should contain the uv pixel spacing:
+    //                      dxdy = max(float2(1.0),
+    //                          SRC.video_size/output_size)/SRC.texture_size;
+    //              5.) tex_size == SRC.texture_size
+    //              6.) texture_size_inv == float2(1.0)/SRC.texture_size
+    //              7.) tex_uv_to_pixel_scale == output_size *
+    //                      SRC.texture_size / SRC.video_size;
+    //              8.) sigma is the desired Gaussian standard deviation, in
+    //                  terms of output pixels.  It should be < ~0.66171875 to
+    //                  ensure the first unused sample (outside the 4x4 box) has
+    //                  a weight < 1.0/256.0.
+    //  Returns:    A true 4x4 Gaussian resize of the input.
+    //  Description:
+    //  Given correct inputs, this Gaussian resizer samples 4 pixel locations
+    //  along each downsized dimension and/or 4 texel locations along each
+    //  upsized dimension.  It computes dynamic weights based on the pixel-space
+    //  distance of each sample from the destination pixel.  It is arbitrarily
+    //  resizable and higher quality than tex2Dblur3x3_resize, but it's slower.
+    //  TODO: Move this to a more suitable file once there are others like it.
+    const float denom_inv = 0.5/(sigma*sigma);
+    //  We're taking 4x4 samples, and we're snapping to texels for upsizing.
+    //  Find texture coords for sample 5 (second row, second column):
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 prev_texel_uv = prev_texel * texture_size_inv;
+    const float2 snap = float2((dxdy.x <= texture_size_inv.x), (dxdy.y <= texture_size_inv.y));
+    const float2 sample5_downsize_uv = tex_uv - 0.5 * dxdy;
+    const float2 sample5_uv = lerp(sample5_downsize_uv, prev_texel_uv, snap);
+    //  Compute texture coords for other samples:
+    const float2 dx = float2(dxdy.x, 0.0);
+    const float2 sample0_uv = sample5_uv - dxdy;
+    const float2 sample10_uv = sample5_uv + dxdy;
+    const float2 sample15_uv = sample5_uv + 2.0 * dxdy;
+    const float2 sample1_uv = sample0_uv + dx;
+    const float2 sample2_uv = sample0_uv + 2.0 * dx;
+    const float2 sample3_uv = sample0_uv + 3.0 * dx;
+    const float2 sample4_uv = sample5_uv - dx;
+    const float2 sample6_uv = sample5_uv + dx;
+    const float2 sample7_uv = sample5_uv + 2.0 * dx;
+    const float2 sample8_uv = sample10_uv - 2.0 * dx;
+    const float2 sample9_uv = sample10_uv - dx;
+    const float2 sample11_uv = sample10_uv + dx;
+    const float2 sample12_uv = sample15_uv - 3.0 * dx;
+    const float2 sample13_uv = sample15_uv - 2.0 * dx;
+    const float2 sample14_uv = sample15_uv - dx;
+    //  Load each sample (sampleN is always fetched at sampleN_uv):
+    float3 sample0 = tex2D_linearize(tex, sample0_uv).rgb;
+    float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
+    float3 sample2 = tex2D_linearize(tex, sample2_uv).rgb;
+    float3 sample3 = tex2D_linearize(tex, sample3_uv).rgb;
+    float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
+    float3 sample5 = tex2D_linearize(tex, sample5_uv).rgb;
+    float3 sample6 = tex2D_linearize(tex, sample6_uv).rgb;
+    float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
+    float3 sample8 = tex2D_linearize(tex, sample8_uv).rgb;
+    float3 sample9 = tex2D_linearize(tex, sample9_uv).rgb;
+    float3 sample10 = tex2D_linearize(tex, sample10_uv).rgb;
+    float3 sample11 = tex2D_linearize(tex, sample11_uv).rgb;
+    float3 sample12 = tex2D_linearize(tex, sample12_uv).rgb;
+    float3 sample13 = tex2D_linearize(tex, sample13_uv).rgb;
+    float3 sample14 = tex2D_linearize(tex, sample14_uv).rgb;
+    float3 sample15 = tex2D_linearize(tex, sample15_uv).rgb;
+    //  Compute destination pixel offsets for each sample:
+    const float2 dest_pixel = tex_uv * tex_uv_to_pixel_scale;
+    const float2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel;
+    //  Compute Gaussian sample weights:
+    const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv);
+    const float w1 = exp(-LENGTH_SQ(sample1_offset) * denom_inv);
+    const float w2 = exp(-LENGTH_SQ(sample2_offset) * denom_inv);
+    const float w3 = exp(-LENGTH_SQ(sample3_offset) * denom_inv);
+    const float w4 = exp(-LENGTH_SQ(sample4_offset) * denom_inv);
+    const float w5 = exp(-LENGTH_SQ(sample5_offset) * denom_inv);
+    const float w6 = exp(-LENGTH_SQ(sample6_offset) * denom_inv);
+    const float w7 = exp(-LENGTH_SQ(sample7_offset) * denom_inv);
+    const float w8 = exp(-LENGTH_SQ(sample8_offset) * denom_inv);
+    const float w9 = exp(-LENGTH_SQ(sample9_offset) * denom_inv);
+    const float w10 = exp(-LENGTH_SQ(sample10_offset) * denom_inv);
+    const float w11 = exp(-LENGTH_SQ(sample11_offset) * denom_inv);
+    const float w12 = exp(-LENGTH_SQ(sample12_offset) * denom_inv);
+    const float w13 = exp(-LENGTH_SQ(sample13_offset) * denom_inv);
+    const float w14 = exp(-LENGTH_SQ(sample14_offset) * denom_inv);
+    const float w15 = exp(-LENGTH_SQ(sample15_offset) * denom_inv);
+    const float weight_sum_inv = 1.0/(
+        w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 +
+        w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15);
+    //  Weight and sum the samples:
+    const float3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15;
+    return sum * weight_sum_inv;
+}
+
+void main() {
+    //  Would a viewport-relative size work better for this pass?  (No.)
+    //  PROS:
+    //  1.) Instead of writing an absolute size to user-cgp-constants.h, we'd
+    //      write a viewport scale.  That number could be used to directly scale
+    //      the viewport-resolution bloom sigma and/or triad size to a smaller
+    //      scale.  This way, we could calculate an optimal dynamic sigma no
+    //      matter how the dot pitch is specified.
+    //  CONS:
+    //  1.) Texel smearing would be much worse at small viewport sizes, but
+    //      performance would be much worse at large viewport sizes, so there
+    //      would be no easy way to calculate a decent scale.
+    //  2.) Worse, we could no longer get away with using a constant-size blur!
+    //      Instead, we'd have to face all the same difficulties as the real
+    //      phosphor bloom, which requires static #ifdefs to decide the blur
+    //      size based on the expected triad size...a dynamic value.
+    //  3.) Like the phosphor bloom, we'd have less control over making the blur
+    //      size correct for an optical blur.  That said, we likely overblur (to
+    //      maintain brightness) more than the eye would do by itself: 20/20
+    //      human vision distinguishes ~1 arc minute, or 1/60 of a degree.  The
+    //      highest viewing angle recommendation I know of is THX's 40.04 degree
+    //      recommendation, at which 20/20 vision can distinguish about 2402.4
+    //      lines.  Assuming the "TV lines" definition, that means 1201.2
+    //      distinct light lines and 1201.2 distinct dark lines can be told
+    //      apart, i.e. 1201.2 pairs of lines.  This would correspond to 1201.2
+    //      pairs of alternating lit/unlit phosphors, so 2402.4 phosphors total
+    //      (if they're alternately lit).  That's a max of 800.8 triads.  Using
+    //      a more popular 30 degree viewing angle recommendation, 20/20 vision
+    //      can distinguish 1800 lines, or 600 triads of alternately lit
+    //      phosphors.  In contrast, we currently blur phosphors all the way
+    //      down to 341.3 triads to ensure full brightness.
+    //  4.) Realistically speaking, we're usually just going to use bilinear
+    //      filtering in this pass anyway, but it only works well to limit
+    //      bandwidth if it's done at a small constant scale.
+    
+    //  Get the constants we need to sample:
+//    const sampler2D texture = ORIG_LINEARIZED.texture;
+//    const float2 tex_uv = tex_uv;
+//    const float2 blur_dxdy = blur_dxdy;
+    const float2 texture_size_ = ORIG_LINEARIZEDtexture_size;
+//    const float2 texture_size_inv = texture_size_inv;
+//    const float2 tex_uv_to_pixel_scale = tex_uv_to_pixel_scale;
+    float2 tex_uv_r, tex_uv_g, tex_uv_b;
+
+    if(beam_misconvergence)
+    {
+        const float2 uv_scanline_step = uv_scanline_step;
+        const float2 convergence_offsets_r = get_convergence_offsets_r_vector();
+        const float2 convergence_offsets_g = get_convergence_offsets_g_vector();
+        const float2 convergence_offsets_b = get_convergence_offsets_b_vector();
+        tex_uv_r = tex_uv - convergence_offsets_r * uv_scanline_step;
+        tex_uv_g = tex_uv - convergence_offsets_g * uv_scanline_step;
+        tex_uv_b = tex_uv - convergence_offsets_b * uv_scanline_step;
+    }
+    //  Get the blur sigma:
+    const float bloom_approx_sigma = get_bloom_approx_sigma(output_size.x,
+        estimated_viewport_size_x);
+
+    //  Sample the resized and blurred texture, and apply convergence offsets if
+    //  necessary.  Applying convergence offsets here triples our samples from
+    //  16/9/1 to 48/27/3, but faster and easier than sampling BLOOM_APPROX and
+    //  HALATION_BLUR 3 times at full resolution every time they're used.
+    float3 color_r, color_g, color_b, color;
+    if(bloom_approx_filter > 1.5)
+    {
+        //  Use a 4x4 Gaussian resize.  This is slower but technically correct.
+        if(beam_misconvergence)
+        {
+            color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r,
+                blur_dxdy, texture_size_, texture_size_inv,
+                tex_uv_to_pixel_scale, bloom_approx_sigma);
+            color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g,
+                blur_dxdy, texture_size_, texture_size_inv,
+                tex_uv_to_pixel_scale, bloom_approx_sigma);
+            color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b,
+                blur_dxdy, texture_size_, texture_size_inv,
+                tex_uv_to_pixel_scale, bloom_approx_sigma);
+        }
+        else
+        {
+            color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv,
+                blur_dxdy, texture_size_, texture_size_inv,
+                tex_uv_to_pixel_scale, bloom_approx_sigma);
+        }
+    }
+    else if(bloom_approx_filter > 0.5)
+    {
+        //  Use a 3x3 resize blur.  This is the softest option, because we're
+        //  blurring already blurry bilinear samples.  It doesn't play quite as
+        //  nicely with convergence offsets, but it has its charms.
+        if(beam_misconvergence)
+        {
+            color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r,
+                blur_dxdy, bloom_approx_sigma);
+            color_g = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_g,
+                blur_dxdy, bloom_approx_sigma);
+            color_b = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_b,
+                blur_dxdy, bloom_approx_sigma);
+        }
+        else
+        {
+            color = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv, blur_dxdy, bloom_approx_sigma);
+        }
+    }
+    else
+    {
+        //  Use bilinear sampling.  This approximates a 4x4 Gaussian resize MUCH
+        //  better than tex2Dblur3x3_resize for the very small sigmas we're
+        //  likely to use at small output resolutions.  (This estimate becomes
+        //  too sharp above ~400x300, but the blurs break down above that
+        //  resolution too, unless min_allowed_viewport_triads is high enough to
+        //  keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.)
+        if(beam_misconvergence)
+        {
+            color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb;
+            color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb;
+            color_b = tex2D_linearize(ORIG_LINEARIZED, tex_uv_b).rgb;
+        }
+        else
+        {
+            color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb;
+        }
+    }
+    //  Pack the colors from the red/green/blue beams into a single vector:
+    if(beam_misconvergence)
+    {
+        color = float3(color_r.r, color_g.g, color_b.b);
+    }
+    //  Encode and output the filtered color selected above (alpha = 1.0):
+    FragColor = encode_output(float4(color, 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/bloom-approx.vs b/shaders/CRT-Royale.shader/bloom-approx.vs
new file mode 100644
index 00000000..e4faac1e
--- /dev/null
+++ b/shaders/CRT-Royale.shader/bloom-approx.vs
@@ -0,0 +1,5859 @@
+#version 150
+
+in vec4 position;
+in vec2 texCoord;
+
+out Vertex {   // Values this vertex stage hands to the next stage; presumably mirrored by an `in Vertex` block in bloom-approx.fs -- TODO confirm
+   vec2 vTexCoord;
+   vec2 tex_uv;   // NOTE(review): bloom-approx.fs samples ORIG_LINEARIZED at an identifier named tex_uv -- likely this member; confirm
+   vec2 blur_dxdy;   // NOTE(review): bloom-approx.fs passes an identifier named blur_dxdy to tex2Dblur3x3resize -- likely this member; confirm
+   vec2 uv_scanline_step;
+   float estimated_viewport_size_x;
+   vec2 texture_size_inv;
+   vec2 tex_uv_to_pixel_scale;
+};
+
+uniform vec4 targetSize;   // .xy is aliased as output_size by the compatibility macros further down
+uniform vec4 sourceSize[];   // [0].xy is aliased as texture_size and video_size; [1].xy as ORIG_LINEARIZEDvideo_size and ORIG_LINEARIZEDtexture_size
+uniform int phase;   // aliased as frame_count by the compatibility macros further down
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms
+#define mul(a,b) ((b)*(a)) // multiplies in swapped order (b*a); arguments are parenthesized so compound expressions keep their grouping
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static   // static, inline and const expand to nothing, so "static const float x = ..." becomes a plain "float x = ..."
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x) // NOTE(review): expands atan2(a,b) to atan(b,a); HLSL atan2(y,x) and GLSL atan(y,x) take the same argument order -- confirm the swap is intended
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+#define ORIG_LINEARIZEDvideo_size sourceSize[1].xy
+#define ORIG_LINEARIZEDtexture_size sourceSize[1].xy
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): this comment previously said the value had been raised to 1.0, but the constant is 0.5 -- confirm which is intended
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (left at the default here; 1.0 is brighter)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (left at the default here; 1.0 is brighter)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+//  Pick the aperture grille average matching the loaded mask: the 15-pixel
+//  value unless PHOSPHOR_MASK_GRILLE14 is #defined.
+#ifndef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.
+//  NOTE: abs() discards the sign of c, so the result is always positive and
+//  at least 2^-16, even for negative inputs.
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    //  Manual mask resizing is unsupported in compatibility mode:
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    //  So is setting the geometry mode at runtime:
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    //  No compatibility restrictions: Use the static filter choice unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.  The bias
+//  variant is #defined whenever the lod variant is; the checks that follow
+//  then #undef whichever variants the drivers don't allow, and the bias
+//  variant is dropped again if the lod variant survives.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    //  tex2Dlod has top priority: Drop every other tiling strategy.
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  tex2Dbias comes next: Drop both tile-based strategies.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        //  When it is #defined, it supersedes fixing discontinuities:
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        //  The tex2Dlod resampling strategy isn't #defined: Resize from the
+        //  small LUT instead.
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    //  tex2Dlod isn't allowed at all: Resize from the small LUT.
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    //  With dynamic branches, runtime mode/type selection is always enabled:
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    //  Without them, it's only enabled if the user explicitly forces it:
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        //  The border must grow with the subpixel offset's magnitude: The
+        //  static red offset may be negative (the blue offset is its
+        //  negation), and a signed value would shrink the border instead of
+        //  padding it.  Use the larger of the two component magnitudes:
+        static const float max_subpixel_offset = max(
+            abs(aa_subpixel_r_offset_static.x),
+            abs(aa_subpixel_r_offset_static.y));
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    //  Without GEOMETRY_EARLY, no antialiasing border is reserved:
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering pads the pixel border by about 0.5, except when a
+//  tex2Dlod/tex2Dbias tiling strategy is active (no padding is added then):
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#endif
+//  Discontinuity fixing, when enabled, needs 1.0 more pixel of border:
+#ifndef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#endif
+//  Round the pixel border up to a whole number of texels.  Under
+//  GEOMETRY_EARLY, assume same-pass curvature about triples the texel
+//  frequency and scale the border to match before rounding:
+#ifndef GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#else
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifdef GEOMETRY_EARLY
+    //  One tile plus a border on each side; sampling starts past the border.
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#else
+    #ifndef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Same bordered layout as above.
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #else
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #endif
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Cap the requested ratio at geom_max_aspect_ratio, then normalize the
+    //  (ratio, 1.0) vector so its absolute scale can't affect the uv-mapping
+    //  for curvature:
+    return normalize(
+        float2(min(geom_aspect_ratio, geom_max_aspect_ratio), 1.0));
+}
+
+inline float2 get_geom_overscan_vector()
+{
+    //  Pack the scalar x/y overscan settings into one vector:
+    float2 overscan = float2(geom_overscan_x, geom_overscan_y);
+    return overscan;
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{
+    //  Pack the scalar x/y tilt angle settings into one vector:
+    float2 tilt_angle = float2(geom_tilt_angle_x, geom_tilt_angle_y);
+    return tilt_angle;
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{
+    //  Gather the x convergence offsets of all three beams as (r, g, b):
+    float3 offsets_x = float3(
+        convergence_offset_x_r,
+        convergence_offset_x_g,
+        convergence_offset_x_b);
+    return offsets_x;
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{
+    //  Gather the y convergence offsets of the three channels, ordered
+    //  (red, green, blue).
+    float3 offsets_y;
+    offsets_y.r = convergence_offset_y_r;
+    offsets_y.g = convergence_offset_y_g;
+    offsets_y.b = convergence_offset_y_b;
+    return offsets_y;
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{
+    //  (x, y) convergence offset of the red channel.
+    float2 offset_r;
+    offset_r.x = convergence_offset_x_r;
+    offset_r.y = convergence_offset_y_r;
+    return offset_r;
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{
+    //  (x, y) convergence offset of the green channel.
+    float2 offset_g;
+    offset_g.x = convergence_offset_x_g;
+    offset_g.y = convergence_offset_y_g;
+    return offset_g;
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{
+    //  (x, y) convergence offset of the blue channel.
+    float2 offset_b;
+    offset_b.x = convergence_offset_x_b;
+    offset_b.y = convergence_offset_y_b;
+    return offset_b;
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    //  The runtime offsets are only used when BOTH runtime antialiasing
+    //  weights and runtime subpixel offsets are enabled; every other macro
+    //  combination falls back to the static offset.
+    #if defined(RUNTIME_ANTIALIAS_WEIGHTS) && defined(RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS)
+        //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+        return float2(aa_subpixel_r_offset_x_runtime,
+            aa_subpixel_r_offset_y_runtime);
+    #else
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  The amplification factor is the reciprocal of the average color of the
+    //  selected mask.  mask_type selects the mask by threshold:
+    //  below 0.5 -> grille, below 1.5 -> slot, anything else -> shadow.
+    if(mask_type < 0.5)
+    {
+        return 1.0/mask_grille_avg_color;
+    }
+    if(mask_type < 1.5)
+    {
+        return 1.0/mask_slot_avg_color;
+    }
+    return 1.0/mask_shadow_avg_color;
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Step 1: pick the source of the mode (runtime-selectable or static).
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        float sample_mode = mask_sample_mode_desired;
+    #else
+        float sample_mode = mask_sample_mode_static;
+    #endif
+    //  Step 2: unless manual mask resizing is compiled in, restrict the mode
+    //  to the range [1.0, 2.0].
+    #ifndef PHOSPHOR_MASK_MANUALLY_RESIZE
+        sample_mode = clamp(sample_mode, 1.0, 2.0);
+    #endif
+    return sample_mode;
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(intermediate_output, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+//  (If OVERRIDE_STANDARD_GAMMA is defined, this whole block is skipped and the
+//  including file must have defined all seven constants itself.)
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+//  When true, encode_output(), decode_input(), and decode_gamma_input() write
+//  alpha = 1.0 wherever they apply a gamma curve, instead of passing the
+//  source alpha through.  To change it, #define OVERRIDE_ALPHA_ASSUMPTIONS and
+//  define this constant before this point.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma accessors.  Built-in defaults are returned unless the
+//  including file #defines OVERRIDE_DEVICE_GAMMA:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    //  Defaults: high reference CRT gamma, a fixed Game Boy Advance gamma
+    //  (in (3.0, 4.0)), and the office LCD gamma.
+    inline float get_crt_gamma()
+    {
+        return crt_reference_gamma_high;
+    }
+    inline float get_gba_gamma()
+    {
+        return 3.5;
+    }
+    inline float get_lcd_gamma()
+    {
+        return lcd_office_gamma;
+    }
+#else
+    //  Overridden: the user promises that crt_gamma, gba_gamma, and lcd_gamma
+    //  are already defined globally.
+    inline float get_crt_gamma()
+    {
+        return crt_gamma;
+    }
+    inline float get_gba_gamma()
+    {
+        return gba_gamma;
+    }
+    inline float get_lcd_gamma()
+    {
+        return lcd_gamma;
+    }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#if defined(OVERRIDE_FINAL_GAMMA)
+    //  The user promises to globally define input_gamma, intermediate_gamma,
+    //  and output_gamma; every SIMULATE_* macro is ignored in this case.
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    //  The first SIMULATE_* macro that is defined wins, in this order:
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_*
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Choose the decoding/encoding gammas for the current pass.  linearize_input
+//  and gamma_encode_output are static constants rather than functions: they
+//  aren't derived from anything overridable, and constants let the compiler
+//  eliminate the untaken branches in decode_input()/encode_output().
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    //  Every pass decodes its input and encodes its output in the shader; the
+    //  intermediate gamma is used everywhere except the pipeline's two ends.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#else
+    //  Only the first pass decodes and only the last pass encodes in the
+    //  shader; all other reads/writes pass through with a gamma of 1.0.
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+//  This is simply the negation of linearize_input, i.e. it is true exactly
+//  when this pass does NOT decode its samples with pow() in decode_input().
+//  NOTE(review): presumably filtering is then gamma-correct because the
+//  sampled texture already holds (or is automatically decoded to) linear
+//  values -- confirm against the preset's sRGB framebuffer settings.
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+//  Gamma-encode a linear color for output from the current pass.
+//  color:      The linear RGBA color produced by the fragment shader.
+//  Returns:    color unchanged if gamma_encode_output is false.  Otherwise
+//              rgb is raised to 1.0/get_pass_output_gamma(), and alpha is
+//              either passed through or forced to 1.0 (assume_opaque_alpha).
+inline float4 encode_output(const float4 color)
+{
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    float4 encoded = color;
+    encoded.rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    if(assume_opaque_alpha)
+    {
+        encoded.a = 1.0;
+    }
+    return encoded;
+}
+
+//  Linearize a color sampled from the current pass's input.
+//  color:      The RGBA sample as read from the texture.
+//  Returns:    color unchanged if linearize_input is false.  Otherwise rgb is
+//              raised to get_pass_input_gamma(), and alpha is either passed
+//              through or forced to 1.0 (assume_opaque_alpha).
+inline float4 decode_input(const float4 color)
+{
+    if(!linearize_input)
+    {
+        return color;
+    }
+    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, alpha);
+}
+
+//  Linearize a sample with a caller-supplied per-channel gamma.  Unlike
+//  decode_input(), this always applies the curve, whatever the pass settings.
+//  color:      The RGBA sample as read from the texture.
+//  gamma:      Exponent applied to each of the r, g, and b channels.
+//  Returns:    float4(pow(rgb, gamma), alpha), with alpha forced to 1.0 when
+//              assume_opaque_alpha is true.
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    float4 decoded = float4(pow(color.rgb, gamma), color.a);
+    if(assume_opaque_alpha)
+    {
+        decoded.a = 1.0;
+    }
+    return decoded;
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D:
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{
+    //  Sample at tex_coords, then apply this pass's input-decoding policy.
+    //  tex_coords is deliberately not const (see the blur-offset note above).
+    float4 encoded_sample = COMPAT_TEXTURE(tex, tex_coords);
+    return decode_input(encoded_sample);
+}
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{
+    //  Only the xy components are used for the lookup; z is ignored.
+    float2 uv = tex_coords.xy;
+    float4 encoded_sample = COMPAT_TEXTURE(tex, uv);
+    return decode_input(encoded_sample);
+}
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{
+    //  NOTE(review): texel_off is handed to textureLod() as the mip level, not
+    //  as a texel offset as the parameter name suggests -- confirm that no
+    //  caller relies on true offset semantics.
+    float lod = float(texel_off);
+    return decode_input(textureLod(tex, tex_coords, lod));
+}
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{
+    //  Only tex_coords.xy is used for the lookup.
+    //  NOTE(review): texel_off is handed to textureLod() as the mip level, not
+    //  as a texel offset as the parameter name suggests -- confirm that no
+    //  caller relies on true offset semantics.
+    float lod = float(texel_off);
+    return decode_input(textureLod(tex, tex_coords.xy, lod));
+}
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod:
+//  Samples tex at tex_coords.xy using the explicit mip level stored in
+//  tex_coords.w (the Cg tex2Dlod convention), then applies this pass's
+//  input-decoding policy.  Previously the level was hard-coded to 0.0, which
+//  silently discarded the caller's LOD; callers passing float4(uv, 0.0, 0.0)
+//  still read mip level 0 exactly as before.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, tex_coords.w));    }
+
+//  Samples tex at tex_coords.xy and applies this pass's input decoding.
+//  NOTE(review): texel_off is passed to textureLod() as the mip level rather
+//  than as a texel offset, and tex_coords.zw are ignored -- confirm which
+//  behavior callers (if any) actually expect before changing this.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
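+/////////*
+//  NOTE(review): the line above is NOT a block-comment terminator.  The
+//  comment opened before the tex3D wrappers therefore stays open, and
+//  everything from here down to the terminator just above the
+//  tex2Dlod_linearize_gamma wrappers (including tex2D_linearize_gamma) is
+//  disabled.  Confirm whether that is intentional before changing it.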
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod:
+//  Samples tex at tex_coords.xy using the explicit mip level stored in
+//  tex_coords.w (the Cg tex2Dlod convention), then linearizes the sample with
+//  the caller-supplied per-channel gamma.  Previously the level was
+//  hard-coded to 0.0, which silently discarded the caller's LOD; callers
+//  passing float4(uv, 0.0, 0.0) still read mip level 0 exactly as before.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, tex_coords.w), gamma);    }
+
+//  Samples tex at tex_coords.xy and linearizes the sample with the given
+//  per-channel gamma via decode_gamma_input().
+//  NOTE(review): texel_off is passed to textureLod() as the mip level rather
+//  than as a texel offset, and tex_coords.zw are ignored -- confirm which
+//  behavior callers (if any) actually expect before changing this.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif  //  DRIVERS_ALLOW_DERIVATIVES
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; currently the default of 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else   //  RADEON_FIX: fall back to option 1 (3x3 resize blur)
+    static const float bloom_approx_filter_static = 1.0;
+#endif  //  RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  15-pixel grille stripes (the default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; a macro overloads for float, float2, etc.  The result is always >= 2^-16 (abs() drops c's sign):
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static setting unchanged
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility (whichever of the pair is unavailable or redundant is #undef'd below).
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance (TEX2DLOD,
+//  then TEX2DBIAS, TILE_FLAT_TWICE, FIX_DISCONTINUITIES) and disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way (tex2Dlod takes precedence over tex2Dbias):
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  !DRIVERS_ALLOW_DYNAMIC_BRANCHES: only if the user forces it
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels.  NOTE(review): max_subpixel_offset is signed (-1.0/3.0 with the default aa_subpixel_r_offset_static), which shrinks this border -- confirm abs() isn't intended:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  !GEOMETRY_EARLY
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else   //  !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else   //  !GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE: one tile plus a border on each side
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: always one tile plus a border on each side
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif  //  DRIVERS_ALLOW_DERIVATIVES
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS (the derived settings then #undef the other options in this group).
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE #undefs this.)
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE #undefs this.)
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (used here).  An earlier note preferred 1.0 because 0.5 looked unnecessarily dark.
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else   //  RADEON_FIX: fall back to the 3x3 resize blur
+    static const float bloom_approx_filter_static = 1.0;
+#endif  //  !RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5].  NOTE(review): the default x below (-1.0/3.0) is outside that range -- confirm whether the range describes the magnitude.
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. re-add horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): an earlier note here said 0.5 was unnecessarily dark and claimed the value was raised to 1.0, but 0.5 is what is actually set
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero by clamping abs(c) to >= 2^-16 (the sign of c is dropped); using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then, b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so map any setting above 1.5 (mode 2) to 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  no compatibility mode: use the static setting unchanged
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used, i.e. whenever
+//  RUNTIME_SHADER_PARAMS_ENABLE is not #defined.  Most of these won't be a
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE    //  problem anyway, but some will.
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod (the unused one is #undef'd below):
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  tex2Dlod strategy not in use: try tex2Dbias next
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  neither tex2Dlod nor tex2Dbias strategy in use
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  tex2Dlod resampling not requested: use the small LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  tex2Dlod not allowed by the drivers: use the small LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  no dynamic branches: only enable runtime selection when forced
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  no same-pass geometry: no antialiasing border is reserved
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering (unless a tex2Dlod/tex2Dbias strategy is active) adds
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY   //  about 0.5 to the border:
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  tex2Dlod family active: no extra anisotropic border
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else   //  discontinuity fixing is off: border unchanged
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else   //  flat sampling: one texel per pixel of border
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  general case: one tile plus a border on each side
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: same values as the general case above
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension; the estimate is only as accurate as mask_resize_viewport_scale:
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;     //  0.0005 below one half
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the argument of erf(), evaluated per component.
+    //  Returns:    An Abramowitz/Stegun three-coefficient estimate of
+    //                  erf(x) = 2/sqrt(pi) * integral_0^x(e**(-t**2) dt),
+    //              with a quoted max absolute error of 2.5*10**-5.  The
+    //              formula is evaluated on abs(x); sign(x) is applied last:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    static const float4 unit = float4(1.0);
+    const float4 u = unit/(unit + 0.47047*abs(x));
+    const float4 horner = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float4 gauss = exp(-(x*x));
+    const float4 magnitude = unit - horner*gauss;
+    return magnitude * sign(x);
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload: per-component Abramowitz/Stegun estimate of erf(x).
+    static const float3 unit = float3(1.0);
+    const float3 u = unit/(unit + 0.47047*abs(x));
+    const float3 horner = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float3 gauss = exp(-(x*x));
+    //  The formula is evaluated on abs(x); sign(x) restores the odd symmetry.
+    return (unit - horner*gauss) * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload: per-component Abramowitz/Stegun estimate of erf(x).
+    static const float2 unit = float2(1.0);
+    const float2 u = unit/(unit + 0.47047*abs(x));
+    const float2 horner = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float2 gauss = exp(-(x*x));
+    //  The formula is evaluated on abs(x); sign(x) restores the odd symmetry.
+    return (unit - horner*gauss) * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload: Abramowitz/Stegun estimate of erf(x).
+    const float u = 1.0/(1.0 + 0.47047*abs(x));
+    const float horner = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float gauss = exp(-(x*x));
+    //  The formula is evaluated on abs(x); sign(x) restores the odd symmetry.
+    return (1.0 - horner*gauss) * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the argument of erf(), evaluated per component.
+    //  Returns:    tanh(1.202760580 * x) as a cheap stand-in for erf(x).  The
+    //              error is visually noticeable, but it is very fast and
+    //              perceptually close (per the author, at least on ATI):
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Relies on the driver's tanh() being implemented correctly;
+    //              the author reports garbage output on an nVidia 8800GTS.
+    const float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    const float3 scaled_x = 1.202760580 * x;    //  Float3 tanh-based erf()
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    const float2 scaled_x = 1.202760580 * x;    //  Float2 tanh-based erf()
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    const float scaled_x = 1.202760580 * x;     //  Scalar tanh-based erf()
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the argument of erf(), evaluated per component.
+    //  Returns:    erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is #defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 overload: erf6(x) unless ERF_FAST_APPROXIMATION selects erft(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 overload: erf6(x) unless ERF_FAST_APPROXIMATION selects erft(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload: erf6(x) unless ERF_FAST_APPROXIMATION selects erft(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the argument of the gamma function; the
+    //                  approximation is tuned for s in [0, 36].
+    //              2.) s_inv = 1.0/s, precomputed by the caller so it can be
+    //                  shared with other computations.
+    //  Returns:    An approximation of gamma(s) (the real-numbered factorial)
+    //              from a two-coefficient Lanczos sum, with the coefficients
+    //              calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 over 2**16 equally
+    //              spaced evals.  Three coeffs (0.0000346 error) would not
+    //              hurt latency, but two allow more parallelism with outside
+    //              instructions.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler = float4(2.71828182845904523536028747135266249775724709);
+    const float4 s_plus_half = s + float4(0.5);
+    const float4 series = coeff0 + coeff1/(s + float4(1.0));
+    const float4 pow_base = (s_plus_half + lanczos_g)/euler;
+    //  pow_base**s_plus_half * series approximates gamma(s + 1); multiplying
+    //  by s_inv yields gamma(s).  Per the original author, this has less
+    //  error for small s's than starting with (s -= 1.0).
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 overload of the two-coefficient Lanczos estimate of gamma(s):
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler = float3(2.71828182845904523536028747135266249775724709);
+    const float3 s_plus_half = s + float3(0.5);
+    const float3 series = coeff0 + coeff1/(s + float3(1.0));
+    const float3 pow_base = (s_plus_half + lanczos_g)/euler;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 overload of the two-coefficient Lanczos estimate of gamma(s):
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler = float2(2.71828182845904523536028747135266249775724709);
+    const float2 s_plus_half = s + float2(0.5);
+    const float2 series = coeff0 + coeff1/(s + float2(1.0));
+    const float2 pow_base = (s_plus_half + lanczos_g)/euler;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload of the two-coefficient Lanczos estimate of gamma(s):
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler = 2.71828182845904523536028747135266249775724709;
+    const float s_plus_half = s + 0.5;
+    const float series = coeff0 + coeff1/(s + 1.0);
+    const float pow_base = (s_plus_half + lanczos_g)/euler;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the argument of the gamma function, and it should lie
+    //              in the [0, 36] range that gamma_impl() is tuned for.
+    //  Returns:    gamma_impl(s, 1/s); quoted max relative error 0.000463.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    const float3 s_inv = float3(1.0)/s;     //  Float3 overload; see gamma_impl
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    const float2 s_inv = float2(1.0)/s;     //  Float2 overload; see gamma_impl
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    const float s_inv = 1.0/s;              //  Scalar overload; see gamma_impl
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The "rolled up" summation is the k = 0..3 partial sum of
+    //      z**s * sum((-z)**k / (k! * (s + k))),
+    //  i.e. in loop form:
+    //      sign = 1.0; z_pow = 1.0; factorial = 1.0;
+    //      sum = sign * z_pow / ((s + 0.0) * factorial);
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          sign *= -1.0; z_pow *= z; factorial *= k;
+    //          sum += sign * z_pow / ((s + k) * factorial);
+    //      }
+    //  Unrolled and constant-folded, with one named term per iteration:
+    const float4 z_pow_s = pow(z, s);
+    const float4 z_squared = z*z;
+    const float4 div1 = s + float4(1.0);            //  1! * (s + 1)
+    const float4 div2 = 2.0*s + float4(4.0);        //  2! * (s + 2)
+    const float4 div3 = 6.0*s + float4(18.0);       //  3! * (s + 3)
+    //float4 div4 = 24.0*s + float4(96.0);          //  4! * (s + 4), unused
+    const float4 term1 = z/div1;
+    const float4 term2 = z_squared/div2;
+    const float4 term3 = z * z_squared/div3;
+    //  An unused fifth term would be:  + z_squared * z_squared / div4
+    //  s_inv is the k = 0 term; accumulate left to right, then scale:
+    return z_pow_s * (((s_inv - term1) + term2) - term3);
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 overload: 4-term series (k = 0..3), scaled by z**s.
+    const float3 z_pow_s = pow(z, s);
+    const float3 z_squared = z*z;
+    const float3 div1 = s + float3(1.0);
+    const float3 div2 = 2.0*s + float3(4.0);
+    const float3 div3 = 6.0*s + float3(18.0);
+    const float3 term1 = z/div1;
+    const float3 term2 = z_squared/div2;
+    const float3 term3 = z * z_squared/div3;
+    //  s_inv is the k = 0 term; accumulate left to right, then scale:
+    return z_pow_s * (((s_inv - term1) + term2) - term3);
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 overload: 4-term series (k = 0..3), scaled by z**s.
+    const float2 z_pow_s = pow(z, s);
+    const float2 z_squared = z*z;
+    const float2 div1 = s + float2(1.0);
+    const float2 div2 = 2.0*s + float2(4.0);
+    const float2 div3 = 6.0*s + float2(18.0);
+    const float2 term1 = z/div1;
+    const float2 term2 = z_squared/div2;
+    const float2 term3 = z * z_squared/div3;
+    //  s_inv is the k = 0 term; accumulate left to right, then scale:
+    return z_pow_s * (((s_inv - term1) + term2) - term3);
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload: 4-term series (k = 0..3), scaled by z**s.
+    const float z_pow_s = pow(z, s);
+    const float z_squared = z*z;
+    const float div1 = s + 1.0;
+    const float div2 = 2.0*s + 4.0;
+    const float div3 = 6.0*s + 18.0;
+    const float term1 = z/div1;
+    const float term2 = z_squared/div2;
+    const float term3 = z * z_squared/div3;
+    //  s_inv is the k = 0 term; accumulate left to right, then scale:
+    return z_pow_s * (((s_inv - term1) + term2) - term3);
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+    //  The truncated continued fraction is built from the innermost level
+    //  outward.  In loop form ("rolled up") it would read:
+    //      denom = float4('inf');
+    //      float4 one = float4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+    //      }
+    //  Unrolled and constant-folded, with one named value per level:
+    const float4 level4 = float4(7.0) + z - s;
+    const float4 level3 = float4(5.0) + z - s + (3.0*s - float4(9.0))/level4;
+    const float4 level2 = float4(3.0) + z - s + (2.0*s - float4(4.0))/level3;
+    const float4 level1 = float4(1.0) + z - s + (s - float4(1.0))/level2;
+    const float4 z_pow_s_exp = pow(z, s) * exp(-z);
+    return z_pow_s_exp / level1;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 overload: 4-level continued fraction, innermost level first.
+    const float3 level4 = float3(7.0) + z - s;
+    const float3 level3 = float3(5.0) + z - s + (3.0*s - float3(9.0))/level4;
+    const float3 level2 = float3(3.0) + z - s + (2.0*s - float3(4.0))/level3;
+    const float3 level1 = float3(1.0) + z - s + (s - float3(1.0))/level2;
+    const float3 z_pow_s_exp = pow(z, s) * exp(-z);
+    return z_pow_s_exp / level1;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 overload: 4-level continued fraction, innermost level first.
+    const float2 level4 = float2(7.0) + z - s;
+    const float2 level3 = float2(5.0) + z - s + (3.0*s - float2(9.0))/level4;
+    const float2 level2 = float2(3.0) + z - s + (2.0*s - float2(4.0))/level3;
+    const float2 level1 = float2(1.0) + z - s + (s - float2(1.0))/level2;
+    const float2 z_pow_s_exp = pow(z, s) * exp(-z);
+    return z_pow_s_exp / level1;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: 4-level continued fraction, innermost level first.
+    const float level4 = 7.0 + z - s;
+    const float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
+    const float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
+    const float level1 = 1.0 + z - s + (s - 1.0)/level2;
+    const float z_pow_s_exp = pow(z, s) * exp(-z);
+    return z_pow_s_exp / level1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  Since we only care about s < 0.5, we only need
+    //              to evaluate two branches (not four) based on z.  Each branch
+    //              uses four terms, with a max relative error of ~0.00182.  The
+    //              branch threshold and specifics were adapted for fewer terms
+    //              from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both estimates are always computed and then blended with 0.0/1.0
+    //  weights: Real branches test slower even when available.  The upper
+    //  incomplete gamma estimate is complemented (1 - x) to get the lower one.
+    static const float4 z_cutoff = float4(0.775075);
+    const float4 upper_based = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float4 series_based = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool4 use_upper;
+    use_upper.x = z.x > z_cutoff.x;
+    use_upper.y = z.y > z_cutoff.y;
+    use_upper.z = z.z > z_cutoff.z;
+    use_upper.w = z.w > z_cutoff.w;
+    return upper_based * float4(use_upper) + series_based * float4(not(use_upper));
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  Float3 overload: compute both estimates, then blend by 0.0/1.0 weights.
+    static const float3 z_cutoff = float3(0.775075);
+    const float3 upper_based = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float3 series_based = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool3 use_upper;
+    use_upper.x = z.x > z_cutoff.x;
+    use_upper.y = z.y > z_cutoff.y;
+    use_upper.z = z.z > z_cutoff.z;
+    //  not() flips each component, selecting the series where z is not large:
+    return upper_based * float3(use_upper) + series_based * float3(not(use_upper));
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  Float2 overload: compute both estimates, then blend by 0.0/1.0 weights.
+    static const float2 z_cutoff = float2(0.775075);
+    const float2 upper_based = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float2 series_based = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool2 use_upper;
+    use_upper.x = z.x > z_cutoff.x;
+    use_upper.y = z.y > z_cutoff.y;
+    //  not() flips each component, selecting the series where z is not large:
+    return upper_based * float2(use_upper) + series_based * float2(not(use_upper));
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload: compute both estimates, then blend by 0.0/1.0 weights.
+    static const float z_cutoff = 0.775075;
+    const float upper_based = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float series_based = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const bool use_upper = z > z_cutoff;
+    return upper_based * float(use_upper) + series_based * float(!use_upper);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5 (see normalized_ligamma_impl() for details)
+    //  Returns:    The normalized lower incomplete gamma function estimate.
+    const float4 inv_s = float4(1.0)/s;
+    const float4 gamma_s = gamma_impl(s, inv_s);
+    const float4 inv_gamma_s = float4(1.0)/gamma_s;
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  Float3 version:
+    //  Computes 1.0/s once, reuses it for gamma_impl(), and forwards both
+    //  reciprocals to normalized_ligamma_impl().
+    const float3 recip_s = float3(1.0)/s;
+    return normalized_ligamma_impl(s, z, recip_s,
+        float3(1.0)/gamma_impl(s, recip_s));
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  Float2 version:
+    //  Computes 1.0/s once, reuses it for gamma_impl(), and forwards both
+    //  reciprocals to normalized_ligamma_impl().
+    const float2 recip_s = float2(1.0)/s;
+    return normalized_ligamma_impl(s, z, recip_s,
+        float2(1.0)/gamma_impl(s, recip_s));
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Float version:
+    //  Computes 1.0/s once, reuses it for gamma_impl(), and forwards both
+    //  reciprocals to normalized_ligamma_impl().
+    const float recip_s = 1.0/s;
+    return normalized_ligamma_impl(s, z, recip_s,
+        1.0/gamma_impl(s, recip_s));
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(intermediate_output, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+//  If OVERRIDE_STANDARD_GAMMA is defined, this whole block is skipped and the
+//  including file must already have declared all seven constants below.
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    //  By default get_crt_gamma() returns crt_reference_gamma_high and
+    //  get_lcd_gamma() returns lcd_office_gamma (see the device gamma getters).
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+//  When this is true, encode_output() and decode_input() (whenever they apply
+//  a gamma curve) and decode_gamma_input() write 1.0 to the alpha channel
+//  instead of passing color.a through.  To replace the default below, #define
+//  OVERRIDE_ALPHA_ASSUMPTIONS and declare assume_opaque_alpha beforehand.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    //  No override requested: fall back to the standard constants.
+    inline float get_crt_gamma()
+    {
+        return crt_reference_gamma_high;
+    }
+    inline float get_gba_gamma()
+    {
+        return 3.5;     //  Game Boy Advance; in (3.0, 4.0)
+    }
+    inline float get_lcd_gamma()
+    {
+        return lcd_office_gamma;
+    }
+#else
+    //  The user promises to globally define crt_gamma, gba_gamma, and
+    //  lcd_gamma before this point; they may be static or uniform values.
+    inline float get_crt_gamma()
+    {
+        return crt_gamma;
+    }
+    inline float get_gba_gamma()
+    {
+        return gba_gamma;
+    }
+    inline float get_lcd_gamma()
+    {
+        return lcd_gamma;
+    }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define input_gamma, intermediate_gamma,
+    //  and output_gamma; none of the SIMULATE_* macros are consulted here.
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    //  Pick the first simulation mode that is #defined; earlier entries take
+    //  precedence over later ones when several are defined at once.
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_* selection
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    //  Every pass decodes what it reads and encodes what it writes.  Only the
+    //  gamma value depends on the pass: the first pass reads with the input
+    //  gamma, the last pass writes with the output gamma, and everything in
+    //  between uses the intermediate gamma.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifndef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #else
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #endif
+    #ifndef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #else
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #endif
+#else
+    //  Only the first pass decodes its input and only the last pass encodes
+    //  its output; all other reads/writes are left untouched (gamma 1.0).
+    #ifndef FIRST_PASS
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #else
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #endif
+    #ifndef LAST_PASS
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #else
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+//  This is true exactly when linearize_input is false, i.e. when decode_input()
+//  returns the sampled color unchanged instead of applying pow() to it.
+//  NOTE(review): presumably the reasoning is that a pow() applied after the
+//  texture fetch comes too late to linearize the texels before filtering.
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Requires:   color is the value the fragment shader wants to output.
+    //  Returns:    color unchanged if gamma_encode_output is false.  Otherwise
+    //              color.rgb raised to 1.0/get_pass_output_gamma(), with alpha
+    //              forced to 1.0 if assume_opaque_alpha is set and passed
+    //              through if not.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    const float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Requires:   color is a freshly sampled texel from this pass's input.
+    //  Returns:    color unchanged if linearize_input is false.  Otherwise
+    //              color.rgb raised to get_pass_input_gamma(), with alpha
+    //              forced to 1.0 if assume_opaque_alpha is set and passed
+    //              through if not.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    const float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Requires:   gamma holds one decoding exponent per color channel.
+    //  Returns:    color.rgb raised to gamma, unconditionally (this function
+    //              does not consult linearize_input).  Alpha is forced to 1.0
+    //              if assume_opaque_alpha is set and passed through if not.
+    const float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D:
+//  Sample tex at tex_coords through COMPAT_TEXTURE and run the result through
+//  decode_input().  The float3 overload only uses tex_coords.xy.
+//  tex_coords is deliberately not const-qualified: per the FIXME note ahead of
+//  these wrappers, a const qualifier on the coordinates caused offset blurs.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+
+//  Overloads taking texel_off: sample with textureLod() and decode the result
+//  with decode_input().  The float3 overload only uses tex_coords.xy.
+//  NOTE(review): texel_off is forwarded as the third argument of textureLod(),
+//  which GLSL interprets as a level of detail rather than a texel offset.
+//  Confirm whether any caller relies on this before treating it as an offset.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod:
+//  Sample tex at tex_coords.xy with textureLod() and run the result through
+//  decode_input().  tex_coords.z and tex_coords.w are not read.
+//  The first overload always samples at level of detail 0.0.
+//  NOTE(review): Cg's tex2Dlod takes the level of detail from tex_coords.w;
+//  this port ignores it.  Confirm that every caller passes w == 0.0.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+
+//  NOTE(review): here texel_off is forwarded as the textureLod() level of
+//  detail, not as a texel offset.  Confirm the intent against the callers.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod:
+//  Sample tex at tex_coords.xy with textureLod() and decode the result with
+//  decode_gamma_input() using the caller-supplied per-channel gamma.
+//  tex_coords.z and tex_coords.w are not read.
+//  The first overload always samples at level of detail 0.0.
+//  NOTE(review): Cg's tex2Dlod takes the level of detail from tex_coords.w;
+//  this port ignores it.  Confirm that every caller passes w == 0.0.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+
+//  NOTE(review): here texel_off is forwarded as the textureLod() level of
+//  detail, not as a texel offset.  Confirm the intent against the callers.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  containing the desired minimum and maximum beam standard
+    //                  deviations, for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; we take
+    //                  sigma_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
+    //              inner f(color) subfunction (see below) as:
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The standard deviation of the Gaussian beam for "color":
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube balances
+    //  between variable width and a soft/realistic shape.   A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    float3 spot_shape;
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Power function: f(color) = pow(color, beam_spot_power)
+        spot_shape = pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Spherical function: f(color) = sqrt(1 - (color - 1)^2)
+        const float3 color_offset = color - float3(1.0);
+        spot_shape = sqrt(float3(1.0) - color_offset*color_offset);
+    }
+    return float3(beam_min_sigma) + sigma_range * spot_shape;
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color, const float shape_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  containing the desired min/max generalized Gaussian
+    //                  beta parameters, for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; we take
+    //                  shape_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian "shape" parameter beta for
+    //              the given color.
+    //  Details/Discussion:
+    //  Beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    const float3 shape_curve = pow(color, float3(beam_shape_power));
+    return beam_min_shape + shape_range * shape_curve;
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  Details:
+    //  The CRT beam profile follows a roughly Gaussian distribution which is
+    //  wider for bright colors than dark ones.  The integral over the full
+    //  range of a Gaussian function is always 1.0, so we can vary the beam
+    //  with a standard deviation without affecting brightness.  'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  Use a numerical approximation of the "error function" (the Gaussian
+    //  indefinite integral) to find the definite integral of the scanline's
+    //  average brightness over a given pixel area.  Even if curved coords were
+    //  used in this pass, a flat scalar pixel height works almost as well as a
+    //  pixel height computed from a full pixel-space to scanline-space matrix.
+    const float3 sigma_rgb = get_gaussian_sigma(color, sigma_range);
+    const float3 half_height = float3(pixel_height * 0.5);
+    const float3 erf_arg_scale = 1.0/(sigma_rgb*sqrt(2.0));
+    const float3 erf_upper = erf((dist + half_height)*erf_arg_scale);
+    const float3 erf_lower = erf((dist - half_height)*erf_arg_scale);
+    return color * 0.5*(erf_upper - erf_lower)/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 inv_alpha = float3(1.0)/alpha;
+    const float3 inv_beta = float3(1.0)/beta;
+    const float3 half_height = float3(pixel_height * 0.5);
+    //  Pass beta alongside 1/beta to gamma_impl to avoid repeated divides.
+    //  Similarly pass beta and 1/gamma(1/beta) to normalized_ligamma_impl.
+    const float3 inv_gamma = float3(1.0)/gamma_impl(inv_beta, beta);
+    const float3 dist_upper = dist + half_height;
+    const float3 dist_lower = dist - half_height;
+    const float3 area_upper = sign(dist_upper) * normalized_ligamma_impl(
+        inv_beta, pow(abs(dist_upper)*inv_alpha, beta), beta, inv_gamma);
+    const float3 area_lower = sign(dist_lower) * normalized_ligamma_impl(
+        inv_beta, pow(abs(dist_lower)*inv_alpha, beta), beta, inv_gamma);
+    return color * 0.5*(area_upper - area_lower)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  See scanline_gaussian_integral_contrib() for detailed comments!
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 sigma_rgb = get_gaussian_sigma(color, sigma_range);
+    //  Precompute reciprocals so the samples below need no divides:
+    const float3 inv_sigma = float3(1.0)/sigma_rgb;
+    const float3 exponent_scale = 0.5 * inv_sigma * inv_sigma;
+    const float3 normalizer = inv_sigma/sqrt(2.0*pi);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Also sample 1/3 pixel away on either side of dist:
+        const float3 third_height = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_height;
+        const float3 dist_minus = abs(dist - third_height);
+        //  Average three pure Gaussian samples:
+        const float3 avg_scale = color/3.0 * normalizer;
+        const float3 w_center = exp(-(dist*dist)*exponent_scale);
+        const float3 w_plus = exp(-(dist_plus*dist_plus)*exponent_scale);
+        const float3 w_minus = exp(-(dist_minus*dist_minus)*exponent_scale);
+        return avg_scale * (w_center + w_plus + w_minus);
+    }
+    else
+    {
+        return color*exp(-(dist*dist)*exponent_scale)*normalizer;
+    }
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  See scanline_generalized_gaussian_integral_contrib() for details!
+    //  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Precompute reciprocals so the samples below need no divides:
+    const float3 inv_alpha = float3(1.0)/alpha;
+    const float3 inv_beta = float3(1.0)/beta;
+    const float3 normalizer = color * beta * 0.5 * inv_alpha /
+        gamma_impl(inv_beta, beta);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel closer to and farther from the scanline too.
+        const float3 third_height = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_height;
+        const float3 dist_minus = abs(dist - third_height);
+        //  Average three generalized Gaussian samples:
+        const float3 w_center = exp(-pow(abs(dist*inv_alpha), beta));
+        const float3 w_plus = exp(-pow(abs(dist_plus*inv_alpha), beta));
+        const float3 w_minus = exp(-pow(abs(dist_minus*inv_alpha), beta));
+        return normalizer/3.0 * (w_center + w_plus + w_minus);
+    }
+    else
+    {
+        return normalizer * exp(-pow(abs(dist*inv_alpha), beta));
+    }
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel, using
+    //              a generalized or pure Gaussian distribution and sampling or
+    //              integrals as desired by user codepath choices.
+    //  Integrals are used when beam_antialias_level > 1.5, samples otherwise.
+    //  The options are parenthesized in case they are defined as macros.
+    if((beam_generalized_gaussian) && (beam_antialias_level > 1.5))
+    {
+        //  Generalized Gaussian, integrated over the pixel:
+        return scanline_generalized_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+    else if(beam_generalized_gaussian)
+    {
+        //  Generalized Gaussian, sampled:
+        return scanline_generalized_gaussian_sampled_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+    else if(beam_antialias_level > 1.5)
+    {
+        //  Pure Gaussian, integrated over the pixel:
+        return scanline_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range);
+    }
+    else
+    {
+        //  Pure Gaussian, sampled:
+        return scanline_gaussian_sampled_contrib(
+            dist, color, pixel_height, sigma_range);
+    }
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Combine via mul(); max avoids bizarre artifacts from negative colors:
+    const float3 blended = mul(weights, float4x3(color0, color1, color2, color3));
+    return max(blended, 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  beam_horiz_linear_rgb_weight is static, so we can branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are colors in gamma-encoded RGB.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+            //  NOTE(review): a disabled "fixme" experiment that declared a
+            //  constant weight of 1.0 was left here; the runtime value is used.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (color1)
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  color0, color1, color2, and color3, where color1 is just
+    //                  left of the output pixel.
+    //  Returns:    Return a horizontally interpolated texture lookup using 2-4
+    //              nearby texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  We can ignore the outside texture lookups for Quilez resampling.
+    const float3 inner_left = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 inner_right = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    float3 outer_left = float3(0.0);
+    float3 outer_right = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        outer_left = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        outer_right = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    //  Pass texels as sampled; the callee handles linear vs. gamma-encoded:
+    return get_interpolated_linear_color(
+        outer_left, inner_left, inner_right, outer_right, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Returns a horizontally filtered color for the row at tex_uv.y.
+    //  Snap to the previous texel and get sample dists from 2/4 nearby texels:
+    const float2 texel_pos = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 left_texel =
+        floor(texel_pos - float2(under_half)) + float2(0.5);
+    const float2 snapped_texel = float2(left_texel.x, texel_pos.y);
+    const float2 snapped_uv = snapped_texel * texture_size_inv;
+    const float frac_x = texel_pos.x - snapped_texel.x;
+    const float4 tap_dists = float4(1.0 + frac_x, frac_x,
+        1.0 - frac_x, 2.0 - frac_x);
+    //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
+    float4 raw_weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez:
+        const float t = tap_dists.y;
+        const float smooth_t = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
+        raw_weights = float4(0.0, 1.0 - smooth_t, smooth_t, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian:
+        float gauss_scale = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        raw_weights = exp(-(tap_dists*tap_dists)*gauss_scale);
+    }
+    else
+    {
+        //  Lanczos2:
+        const float4 pi_x = FIX_ZERO(tap_dists * pi);
+        raw_weights = 2.0 * sin(pi_x) * sin(pi_x * 0.5) /
+            (pi_x * pi_x);
+    }
+    //  Ensure the weight sum == 1.0:
+    const float4 unit_weights = raw_weights/dot(raw_weights, float4(1.0));
+    //  Get the interpolated horizontal scanline color:
+    const float2 texel_step_u = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        tex, snapped_uv, texel_step_u, unit_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Returns the scanline color at tex_uv, optionally sampling R, G, and B
+    //  at separate horizontal offsets.  Rely on a helper to make this easier.
+    if(!(beam_misconvergence))
+    {
+        //  All channels share a single lookup:
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+    //  Scale the per-channel x offsets by the inverse texture width:
+    const float3 offsets_rgb =
+        get_convergence_offsets_x_vector();
+    const float3 shift_u =
+        offsets_rgb * texture_size_inv.xxx;
+    const float2 uv_red = tex_uv - float2(shift_u.r, 0.0);
+    const float2 uv_green = tex_uv - float2(shift_u.g, 0.0);
+    const float2 uv_blue = tex_uv - float2(shift_u.b, 0.0);
+    const float3 red_lookup = sample_single_scanline_horizontal(
+        tex, uv_red, tex_size, texture_size_inv);
+    const float3 green_lookup = sample_single_scanline_horizontal(
+        tex, uv_green, tex_size, texture_size_inv);
+    const float3 blue_lookup = sample_single_scanline_horizontal(
+        tex, uv_blue, tex_size, texture_size_inv);
+    //  Keep only the matching channel from each shifted lookup:
+    return float3(red_lookup.r, green_lookup.g, blue_lookup.b);
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only consider even/odd scanlines every
+    //  other frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    //  NOTE(review): a disabled "fixme" experiment declaring a constant 1.0
+    //  was left here; the global interlace_bff is what is actually used.
+    const float field_shift = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 texel_pos = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 texel_index = floor(texel_pos - float2(under_half));
+    const float field_mismatch = fmod(
+        texel_index.y + field_shift, il_step_multiple.y);
+    const float2 line_index = texel_index - float2(0.0, field_mismatch);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 line_center = line_index + float2(0.5);
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (texel_pos.y - line_center.y)/il_step_multiple.y;
+    //  Scale the snapped texel position by the inverse texture size:
+    return line_center * texture_size_inv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Detect interlacing based on the number of lines in the source.
+    //  Never report interlacing when detection is switched off:
+    if(!(interlace_detect))
+    {
+        return false;
+    }
+    //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+    //  NTSC Emulators: Typically 224 or 240 lines
+    //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+    //  PAL Emulators: ?
+    //  ATSC: 720p, 1080i, 1080p
+    //  Where do we place our cutoffs?  Assumptions:
+    //  1.) We only need to care about active lines.
+    //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+    //  3.) Anything > 576 lines is probably not interlaced...
+    //  4.) ...except 1080 lines, which is a crapshoot (user decision).
+    //  5.) Just in case the main program uses calculated video sizes,
+    //      we should nudge the float thresholds a bit.
+    //  Apply the nudged cutoffs described above:
+    const bool sd_range = (num_lines > 288.5) && (num_lines < 576.5);
+    const bool hd_range = (num_lines > 1079.5) && (num_lines < 1080.5);
+    const bool hd_enabled = bool(interlace_1080i);
+    //  A 1080-line source only counts as interlaced when interlace_1080i
+    //  is set:
+    return (sd_range || (hd_enabled && hd_range));
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+///////////////////////////////  END VERTEX INCLUDES  /////////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+float bloom_approx_scale_x = targetSize.x / sourceSize[0].y;  //  Target width divided by the HEIGHT (.y) of source 0 -- NOTE(review): confirm the x/y mix is intended.
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);  //  Evaluates to 1474560.0 -- NOTE(review): confirm the intended units/value.
+
+void main() {
+    //  Pass the vertex position and texture coordinate through:
+    gl_Position = position;
+    vTexCoord = texCoord;
+    //  Rescale by texture_size/video_size, then by the ORIG_LINEARIZED sizes:
+    const float2 scaled_uv = vTexCoord * texture_size/video_size;
+    tex_uv = scaled_uv * ORIG_LINEARIZEDvideo_size /
+        ORIG_LINEARIZEDtexture_size;
+    //  The last pass (vertical scanlines) had a viewport y scale, so we can
+    //  use it to calculate a better runtime sigma:
+    estimated_viewport_size_x =
+        video_size.y * geom_aspect_ratio_x/geom_aspect_ratio_y;
+
+    //  Get the uv sample distance between output pixels.  We're using a resize
+    //  blur, so arbitrary upsizing will be acceptable if filter_linearN =
+    //  "true," and arbitrary downsizing will be acceptable if mipmap_inputN =
+    //  "true" too.  The blur will be much more accurate if a true 4x4 Gaussian
+    //  resize is used instead of tex2Dblur3x3_resize (which samples between
+    //  texels even for upsizing).
+    const float2 inv_orig_texture_size =
+        float2(1.0, 1.0)/ORIG_LINEARIZEDtexture_size;
+    //  Begin with the unclamped source/output size ratio:
+    float2 dxdy_scale = ORIG_LINEARIZEDvideo_size/output_size;
+    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
+    {
+        //  For upsizing, we'll snap to texels and sample the nearest 4, so
+        //  never let the scale drop below 1.0 here.
+        dxdy_scale = max(dxdy_scale, float2(1.0, 1.0));
+    }
+    blur_dxdy = dxdy_scale * inv_orig_texture_size;
+    //  tex2Dresize_gaussian4x4 needs to know a bit more than the other filters:
+    tex_uv_to_pixel_scale = output_size *
+        ORIG_LINEARIZEDtexture_size / ORIG_LINEARIZEDvideo_size;
+    //  The inverse texture size is only used locally in this function.
+
+    //  Detecting interlacing again here lets us apply convergence offsets in
+    //  this pass.  il_step_multiple contains the (texel, scanline) step
+    //  multiple: 1 for progressive, 2 for interlaced.
+    const float2 orig_size = ORIG_LINEARIZEDvideo_size;
+    const float field_step = 1.0 + float(is_interlaced(orig_size.y));
+    const float2 il_step_multiple = float2(1.0, field_step);
+    //  Get the uv distance between (texels, same-field scanlines):
+    uv_scanline_step = il_step_multiple / ORIG_LINEARIZEDtexture_size;
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.fs b/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.fs
new file mode 100644
index 00000000..7750152c
--- /dev/null
+++ b/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.fs
@@ -0,0 +1,7240 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+uniform sampler2D source[];   // pass inputs; aliased below: [0] bloom_texture, [1] BRIGHTPASS, [2] MASKED_SCANLINES, [5] HALATION_BLUR
+uniform vec4 sourceSize[];    // .xy of entry N is aliased below as the size of source[N]
+uniform vec4 targetSize;      // .xy is aliased below as output_size
+
+// Vertex-stage inputs.  The commented-out UVs did not work as vertex outputs, so they are rebuilt from vTexCoord below.
+in Vertex {
+   vec2 vTexCoord;
+//   vec2 video_uv;
+//   vec2 scanline_tex_uv;
+//   vec2 halation_tex_uv;
+//   vec2 brightpass_tex_uv;
+//   vec2 bloom_tex_uv;
+   vec2 bloom_dxdy;
+   float bloom_sigma_runtime;
+};
+// NOTE(review): these globals all copy vTexCoord; the initializer is a shader input, not a constant expression -- confirm all target GLSL compilers accept it.
+   vec2 video_uv = vTexCoord;
+   vec2 scanline_tex_uv = vTexCoord;
+   vec2 halation_tex_uv = vTexCoord;
+   vec2 brightpass_tex_uv = vTexCoord;
+   vec2 bloom_tex_uv = vTexCoord;
+
+out vec4 FragColor;
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms (function-like macros parenthesize their arguments)
+#define mul(a,b) ((b)*(a))  // operands are swapped on purpose; parentheses keep compound arguments such as (v + w) intact
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)
+#define rsqrt(c) inversesqrt(c)
+
+#define bloom_texture source[0]
+
+#define MASKED_SCANLINEStexture source[2]
+#define MASKED_SCANLINEStexture_size sourceSize[2].xy
+#define MASKED_SCANLINESvideo_size sourceSize[2].xy
+#define HALATION_BLURtexture source[5]
+#define HALATION_BLURtexture_size sourceSize[5].xy
+#define HALATION_BLURvideo_size sourceSize[5].xy
+#define BRIGHTPASStexture source[1]
+#define BRIGHTPASStexture_size sourceSize[1].xy
+#define BRIGHTPASSvideo_size sourceSize[1].xy
+
+#ifdef GL_ES
+	#define COMPAT_PRECISION mediump  // GL ES builds get an explicit precision qualifier
+#else
+	#define COMPAT_PRECISION          // otherwise the qualifier expands to nothing
+#endif
+
+#if __VERSION__ < 130
+	#define COMPAT_TEXTURE texture2D  // older GLSL only has the texture2D() lookup
+#else
+	#define COMPAT_TEXTURE texture    // GLSL 1.30 and newer use the overloaded texture()
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the stated default (an older note here said it was set to 1.0, which does not match this value)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//".
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): an older note here said 0.5 was unnecessarily dark and that the value had been set to 1.0, but the constant is 0.5 -- confirm which is intended.
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  option 2: true 4x4 Gaussian resize
+#else   //  RADEON_FIX is #defined
+    static const float bloom_approx_filter_static = 1.0;    //  option 1: 3x3 resize blur
+#endif  //  RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;    //  14-pixel stripes
+#else   //  PHOSPHOR_MASK_GRILLE14 not #defined
+    static const float mask_grille_avg_color = mask_grille15_avg_color;    //  15-pixel stripes
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero: yields max(abs(c), 2^-16), so the sign of c is dropped.  Using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace any value > 1.5 with 0.0 (bilinear):
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  no compatibility mode: use the static filter choice unchanged
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE   //  static parameters only: drop every runtime option
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Keep only the best-performing #defined anisotropic tiling strategy, in order:
+//  TEX2DLOD, TEX2DBIAS, TILE_FLAT_TWICE, FIX_DISCONTINUITIES.  All nesting is here.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, so define one
+//  umbrella macro when either is active; this reduces later #ifdef nesting:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize resampling strategies the same way: tex2Dlod wins over tex2Dbias.
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  Choose the source LUT size for mask resizing.  If we can use the large mipmapped
+//  LUT without mipmapping artifacts, we should: It gives more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels.  NOTE(review): a negative static r offset x (e.g. -1.0/3.0) shrinks this border; abs() may be intended -- confirm:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border (not added for the tex2Dlod family):
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set how many resized tiles get rendered to MASK_RESIZE, and set the
+//  starting texel (inside the borders) from which that result is sampled.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (the estimate is only as accurate as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (NOTE(review): an earlier note said 1.0 was used because 0.5 looked too dark, but the value here is 0.5)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (the value in effect here); raise toward 1.0 if the output looks unnecessarily dark
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero by clamping |c| to >= 2^-16 (note: abs() discards the sign of c); using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE  //  compatibility path: strip unsupported features
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so any static value > 1.5 is replaced by 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else  //  no compatibility mode: use the static setting unchanged
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE  //  static-parameter build: undefine every runtime flag below
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD  //  requesting tex2Dlod also requests the tex2Dbias backup
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD  //  same backup rule for mask resampling
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD  //  tex2Dlod survived: undefine the three lower-priority strategies
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else  //  no tex2Dlod tiling strategy
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS  //  tex2Dbias is next: undefine the remaining two
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else  //  neither tex2Dlod nor tex2Dbias
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+//  Whichever of the tex2Dlod/tex2Dbias tiling strategies is still #defined
+//  at this point puts us in the "tex2Dlod family."  A single combined test
+//  emits the family flag once:
+#if defined(ANISOTROPIC_TILING_COMPAT_TEX2DLOD) || defined(ANISOTROPIC_TILING_COMPAT_TEX2DBIAS)
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+//  If both resampling strategies are still #defined, tex2Dlod takes
+//  precedence and the tex2Dbias flag is dropped:
+#if defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD) && defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS)
+    #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+//  The large LUT is the resize source only when the driver allows tex2Dlod
+//  AND the tex2Dlod resampling strategy is #defined; every other
+//  configuration resizes from the small LUT.
+#if defined(DRIVERS_ALLOW_TEX2DLOD) && defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD)
+    //  TODO: Take advantage of this!
+    #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+#else
+    //  Either requirement is missing: fall back to the small LUT.
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+//  Runtime mask mode/type selection is compiled in when dynamic branches are
+//  allowed, or when the user forces it without them through
+//  FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT (the trade-offs are laid out
+//  in the comment preceding this block):
+#if defined(DRIVERS_ALLOW_DYNAMIC_BRANCHES) || defined(FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT)
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifndef GEOMETRY_EARLY
+    static const float max_aa_base_pixel_border = 0.0;
+#else
+    //  Most antialiasing filters have a base radius of 4.0 pixels; the x
+    //  component of the static red subpixel offset is added on top of that:
+    static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+    static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#else
+    static const float max_aniso_pixel_border = 0.5 + max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#else
+    static const float max_tiled_pixel_border = 1.0 + max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifndef GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#else
+    static const float max_mask_texel_border =
+        ceil(3.0 * max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+//  Two layouts exist.  Without GEOMETRY_EARLY, TILE_FLAT_TWICE renders
+//  exactly two borderless tiles; every other configuration renders one tile
+//  padded by the worst-case tile border computed above.
+#if !defined(GEOMETRY_EARLY) && defined(ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE)
+    //  Special case: Render two tiles without borders.  Anisotropic
+    //  filtering doesn't seem to be a problem here.
+    static const float mask_resize_num_tiles = 1.0 + 1.0;
+    static const float mask_start_texels = 0.0;
+#else
+    //  General case (GEOMETRY_EARLY, or TILE_FLAT_TWICE not #defined): one
+    //  tile plus twice max_mask_tile_border, with sampling starting
+    //  max_mask_texel_border texels in.
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Cap the requested ratio at geom_max_aspect_ratio, then return a
+    //  unit-length (ratio, 1.0) vector so the absolute scale of the ratio
+    //  cannot affect the uv-mapping for curvature:
+    const float capped_ratio =
+        min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const float2 raw_aspect = float2(capped_ratio, 1.0);
+    return normalize(raw_aspect);
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Packs geom_overscan_x and geom_overscan_y into one (x, y) float2.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Packs geom_tilt_angle_x and geom_tilt_angle_y into one (x, y) float2.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  Packs the x convergence offsets into one float3, ordered (r, g, b).
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  Packs the y convergence offsets into one float3, ordered (r, g, b).
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Packs the red channel's x and y convergence offsets into one float2.
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Packs the green channel's x and y convergence offsets into one float2.
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Packs the blue channel's x and y convergence offsets into one float2.
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    //  The runtime x/y offsets are returned only when BOTH runtime
+    //  antialiasing weights and runtime subpixel offsets are #defined;
+    //  every other combination returns the static setting.
+    #if defined(RUNTIME_ANTIALIAS_WEIGHTS) && defined(RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS)
+        //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+        return float2(aa_subpixel_r_offset_x_runtime,
+            aa_subpixel_r_offset_y_runtime);
+    #else
+        //  At least one of the two flags is missing:
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Gain = 1/avg color; mask_type < 0.5: grille, < 1.5: slot, else shadow.
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    float non_grille_gain = mask_type < 1.5 ? slot_gain : shadow_gain;
+    return mask_type < 0.5 ? grille_gain : non_grille_gain;
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Start from the runtime request when runtime mask mode/type selection
+    //  is compiled in, and from the static setting otherwise:
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        float requested_mode = mask_sample_mode_desired;
+    #else
+        float requested_mode = mask_sample_mode_static;
+    #endif
+    //  Without PHOSPHOR_MASK_MANUALLY_RESIZE, confine the request to the
+    //  range [1.0, 2.0]:
+    #ifndef PHOSPHOR_MASK_MANUALLY_RESIZE
+        requested_mode = clamp(requested_mode, 1.0, 2.0);
+    #endif
+    return requested_mode;
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+float bloom_approx_scale_x = targetSize.x / sourceSize[0].y;  //  NOTE(review): an x scale divided by a y extent -- confirm this is intended
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);  //  NOTE(review): evaluates to 1474560.0 -- confirm the intended value/units
+const float bloom_diff_thresh_ = 1.0/256.0;  //  = 0.00390625
+
+///////////////////////////  BEGIN FRAGMENT-INCLUDES  ///////////////////////////
+
+//#include "bloom-functions.h"
+
+////////////////////////////  BEGIN BLOOM-FUNCTIONS  ///////////////////////////
+
+#ifndef BLOOM_FUNCTIONS_H
+#define BLOOM_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These utility functions and constants help several passes determine the
+//  size and center texel weight of the phosphor bloom in a uniform manner.
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  We need to calculate the correct blur sigma using some .cgp constants:
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default (raise toward 1.0 if the result is unnecessarily dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an earlier note said it was raised to 1.0, but 0.5 is the value set here)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14   //  Use the 14-pixel-stripe grille average
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else                           //  Otherwise use the 15-pixel-stripe average
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16 floor; abs() drops the sign of c
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so map any static value > 1.5 to mode 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  No compatibility mode: use the static setting unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD is not #defined
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  Neither the tex2Dlod nor the tex2Dbias strategy is #defined
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  Resampling strategy not #defined: use the small LUT.
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  DRIVERS_ALLOW_TEX2DLOD not #defined: use the small LUT.
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  No dynamic branches: enable only if the user forces it.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  GEOMETRY_EARLY not #defined: no antialiasing border is reserved.
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  General case: one tile plus a tile border on each side.
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY #defined: always one tile plus borders.
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/blur-functions.h"
+
+////////////////////////////  BEGIN BLUR-FUNCTIONS  ///////////////////////////
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  output_size < video_size.
+//              4.) output_size == video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (video_size/output_size)/texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(video_size/output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static const float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static const float blur3_std_dev
+//                      static const float blur4_std_dev
+//                      static const float blur5_std_dev
+//                      static const float blur6_std_dev
+//                      static const float blur7_std_dev
+//                      static const float blur8_std_dev
+//                      static const float blur9_std_dev
+//                      static const float blur10_std_dev
+//                      static const float blur11_std_dev
+//                      static const float blur12_std_dev
+//                      static const float blur17_std_dev
+//                      static const float blur25_std_dev
+//                      static const float blur31_std_dev
+//                      static const float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static const float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+#ifndef OVERRIDE_BLUR_STD_DEVS
+    //  blurN_std_dev values are specified in terms of dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  By request, we can define standard deviations corresponding to a
+        //  binomial distribution with p = 0.5 (related to Pascal's triangle).
+        //  This distribution works such that blurring multiple times should
+        //  have the same result as a single larger blur.  These values are
+        //  larger than default for blurs up to 6x and smaller thereafter.
+        static const float blur3_std_dev = 0.84931640625;
+        static const float blur4_std_dev = 0.84931640625;
+        static const float blur5_std_dev = 1.0595703125;
+        static const float blur6_std_dev = 1.06591796875;
+        static const float blur7_std_dev = 1.17041015625;
+        static const float blur8_std_dev = 1.1720703125;
+        static const float blur9_std_dev = 1.2259765625;
+        static const float blur10_std_dev = 1.21982421875;
+        static const float blur11_std_dev = 1.25361328125;
+        static const float blur12_std_dev = 1.2423828125;
+        static const float blur17_std_dev = 1.27783203125;
+        static const float blur25_std_dev = 1.2810546875;
+        static const float blur31_std_dev = 1.28125;
+        static const float blur43_std_dev = 1.28125;
+    #else
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        static const float blur3_std_dev = 0.62666015625;
+        static const float blur4_std_dev = 0.66171875;
+        static const float blur5_std_dev = 0.9845703125;
+        static const float blur6_std_dev = 1.02626953125;
+        static const float blur7_std_dev = 1.36103515625;
+        static const float blur8_std_dev = 1.4080078125;
+        static const float blur9_std_dev = 1.7533203125;
+        static const float blur10_std_dev = 1.80478515625;
+        static const float blur11_std_dev = 2.15986328125;
+        static const float blur12_std_dev = 2.215234375;
+        static const float blur17_std_dev = 3.45535583496;
+        static const float blur25_std_dev = 5.3409576416;
+        static const float blur31_std_dev = 6.86488037109;
+        static const float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
+    //  in shared-sample blurs but increase blurring and feature shifting.
+    static const float error_blurring = 0.5;
+#endif
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(intermediate_output, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas (defaults, unless the user overrides them):
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default CRT gamma
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default LCD gamma
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  If true, the encode/decode functions below output alpha = 1.0 rather than
+//  color.a.  That may help users avoid some bugs, but only if they know of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    //  Built-in defaults (CRT: reference-high, LCD: office, GBA: fixed value):
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+    inline float get_gba_gamma() { return 3.5; }    //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+#else   //  The user promises to globally define crt/gba/lcd_gamma:
+    inline float get_lcd_gamma() { return lcd_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_crt_gamma() { return crt_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifndef OVERRIDE_FINAL_GAMMA
+    //  When every pass is gamma-corrected, ntsc_gamma is always used between
+    //  passes, so middle passes needn't care whether anything is simulated:
+    inline float get_intermediate_gamma() { return ntsc_gamma; }
+    //  The first #defined SIMULATE_* macro in this chain takes precedence:
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+        inline float get_input_gamma()  { return get_crt_gamma(); }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+        inline float get_input_gamma()  { return get_gba_gamma(); }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_output_gamma() { return get_crt_gamma(); }
+        inline float get_input_gamma()  { return get_lcd_gamma(); }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_output_gamma() { return get_crt_gamma(); }
+        inline float get_input_gamma()  { return get_gba_gamma(); }
+    #else   //  Don't simulate anything:
+        inline float get_output_gamma() { return ntsc_gamma; }
+        inline float get_input_gamma()  { return ntsc_gamma; }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#else   //  The user promises to globally define the appropriate constants:
+    inline float get_output_gamma()       { return output_gamma; }
+    inline float get_input_gamma()        { return input_gamma; }
+    inline float get_intermediate_gamma() { return intermediate_gamma; }
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    static const bool gamma_encode_output = true;
+    static const bool linearize_input = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #endif  //  LAST_PASS
+#else   //  Only the first pass decodes and only the last pass encodes:
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+        static const bool linearize_input = true;
+    #else
+        inline float get_pass_input_gamma() { return 1.0; }
+        static const bool linearize_input = false;
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+        static const bool gamma_encode_output = true;
+    #else
+        inline float get_pass_output_gamma() { return 1.0; }
+        static const bool gamma_encode_output = false;
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;  //  i.e. no manual decode
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode this pass's output color according to the pass settings.
+    //  Returns:    color unchanged if gamma_encode_output is false; otherwise
+    //              rgb raised to 1.0/get_pass_output_gamma(), with alpha
+    //              either copied from color.a or forced to 1.0 when
+    //              assume_opaque_alpha is set.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+
+    //  Alpha is never gamma-encoded; only its source depends on the policy:
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    float3 out_gamma_inv = float3(1.0/get_pass_output_gamma());
+    float3 out_rgb = pow(color.rgb, out_gamma_inv);
+    return float4(out_rgb, out_alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize a sample read from this pass's input per the pass settings.
+    //  Returns:    color unchanged if linearize_input is false; otherwise
+    //              rgb raised to get_pass_input_gamma(), with alpha either
+    //              copied from color.a or forced to 1.0 when
+    //              assume_opaque_alpha is set.
+    if(!linearize_input)
+    {
+        return color;
+    }
+
+    //  Alpha is never gamma-decoded; only its source depends on the policy:
+    float in_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    float3 in_gamma = float3(get_pass_input_gamma());
+    float3 linear_rgb = pow(color.rgb, in_gamma);
+    return float4(linear_rgb, in_alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Decode color.rgb with a caller-supplied per-channel gamma.  Unlike
+    //  decode_input(), this always applies pow(); linearize_input is not
+    //  consulted.  Alpha is copied from color.a, or forced to 1.0 when
+    //  assume_opaque_alpha is set.
+
+    float in_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    float3 linear_rgb = pow(color.rgb, gamma);
+    return float4(linear_rgb, in_alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample with COMPAT_TEXTURE, then run the result through decode_input().
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 coords: only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is forwarded as textureLod's LOD argument, not as a texel offset -- confirm intended.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  float3 coords with texel_off: only .xy is used; texel_off again lands in the LOD slot (see NOTE above).
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: NOTE(review): tex_coords.zw are ignored and LOD 0.0 is always requested -- confirm intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): texel_off is forwarded as textureLod's LOD argument, not as a texel offset -- confirm intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
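+/////////*
+//  NOTE(review): the marker above does not end the block comment opened before "tex3D:", so everything down to the terminator after the tex2Dfetch_linearize_gamma overloads (including tex2D_linearize_gamma) is commented out.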
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: decode with a caller-supplied gamma.  NOTE(review): tex_coords.zw are ignored and LOD 0.0 is always requested -- confirm intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): texel_off is forwarded as textureLod's LOD argument, not as a texel offset -- confirm intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "quad-pixel-communication.h"
+
+///////////////////////  BEGIN QUAD-PIXEL-COMMUNICATION  //////////////////////
+
+#ifndef QUAD_PIXEL_COMMUNICATION_H
+#define QUAD_PIXEL_COMMUNICATION_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey*
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DISCLAIMER  /////////////////////////////////
+
+//  *This code was inspired by "Shader Amortization using Pixel Quad Message
+//  Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2.  My intent
+//  is not to plagiarize his fundamentally similar code and assert my own
+//  copyright, but the algorithmic helper functions require so little code that
+//  implementations can't vary by much except bugfixes and conventions.  I just
+//  wanted to license my own particular code here to avoid ambiguity and make it
+//  clear that as far as I'm concerned, people can do as they please with it.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  Given screen pixel numbers, derive a "quad vector" describing a fragment's
+//  position in its 2x2 pixel quad.  Given that vector, obtain the values of any
+//  variable at neighboring fragments.
+//  Requires:   Using this file in general requires:
+//              1.) ddx() and ddy() are present in the current Cg profile.
+//              2.) The GPU driver is using fine/high-quality derivatives.
+//                  Functions will give incorrect results if this is not true,
+//                  so a test function is included.
+
+
+/////////////////////  QUAD-PIXEL COMMUNICATION PRIMITIVES  ////////////////////
+
+float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   output_pixel_num_wrt_uvxy holds the current fragment's
+    //              output pixel number, measured two ways, each in the
+    //              range ([0, output_size.x), [0, output_size.y)):
+    //              1.) .xy increase along with the uv coords.
+    //              2.) .zw increase along with screen xy.
+    //  Returns:    The fragment's placement in its 2x2 quad, measured
+    //              the same two ways:
+    //              1.) .xy follow the uv direction, with the origin (0, 0)
+    //                  at the top-left:
+    //                  top-left     = (-1.0, -1.0) top-right    = ( 1.0, -1.0)
+    //                  bottom-left  = (-1.0,  1.0) bottom-right = ( 1.0,  1.0)
+    //                  Use this to arrange/weight shared texture samples.
+    //              2.) .zw follow the screen xy direction (position); the
+    //                  origin varies.  quad_gather needs this measure.
+    //              Note: quad_vector.zw = quad_vector.xy * float2(
+    //                      ddx(output_pixel_num_wrt_uvxy.x),
+    //                      ddy(output_pixel_num_wrt_uvxy.y));
+    //  Caveats:    Assumes the GPU driver always starts 2x2 pixel quads at
+    //              even pixel numbers, which can be (nondeterministically)
+    //              wrong for odd output resolutions.
+    float4 odd_flag = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0;
+    return odd_flag * 2.0 - float4(1.0);
+}
+
+float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   Same as get_quad_vector_naive() (see that first).
+    //  Returns:    Same as get_quad_vector_naive() (see that first), but it's
+    //              correct even if the 2x2 pixel quad starts at an odd pixel,
+    //              which can occur at odd resolutions.
+    float4 quad_vector_guess =
+        get_quad_vector_naive(output_pixel_num_wrt_uvxy);
+    //  If quad_vector_guess.zw doesn't increase with screen xy, we know the
+    //  2x2 quad starts at an odd pixel; halve the +/-2.0 derivative to +/-1.0:
+    float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z),
+                                                ddy(quad_vector_guess.w));
+    return quad_vector_guess * odd_start_mirror.xyxy;  //  Flip the guess where the mirror is negative
+}
+
+float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
+{
+    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //              2.) output_pixel_num_wrt_uv must increase with uv coords and
+    //                  measure the current fragment's output pixel number in:
+    //                      ([0, output_size.x), [0, output_size.y))
+    //  Returns:    Same as get_quad_vector_naive() (see that first), but it's
+    //              correct even if the 2x2 pixel quad starts at an odd pixel,
+    //              which can occur at odd resolutions.
+    //  Caveats:    This function requires less information than the version
+    //              taking a float4, but it's potentially slower.
+    //  Do screen coords increase with or against uv?  Get the direction
+    //  with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}.
+    float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x),
+                                        ddy(output_pixel_num_wrt_uv.y));
+    float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;  //  Integer pixel numbers: even -> 0.0, odd -> 1.0
+    float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - float2(0.5)) * 2.0;  //  Remap 0.0/1.0 to -1.0/1.0
+    float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror;
+    //  If quad_vector_screen_guess doesn't increase with screen xy, we know
+    //  the 2x2 pixel quad starts at an odd pixel:
+    float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x),
+                                                ddy(quad_vector_screen_guess.y));
+    float4 quad_vector_guess = float4(
+        quad_vector_uv_guess, quad_vector_screen_guess);  //  .xy = uv measure, .zw = screen measure
+    return quad_vector_guess * odd_start_mirror.xyxy;
+}
+
+void quad_gather(float4 quad_vector, float4 curr,
+    out float4 adjx, out float4 adjy, out float4 diag)
+{
+    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //              2.) The GPU driver is using fine/high-quality derivatives.
+    //              3.) quad_vector describes the current fragment's location in
+    //                  its 2x2 pixel quad using get_quad_vector()'s conventions.
+    //              4.) curr is any vector you wish to get neighboring values of.
+    //  Returns:    Values of an input vector (curr) at neighboring fragments
+    //              adjacent x, adjacent y, and diagonal (via out parameters).
+    adjx = curr - ddx(curr) * quad_vector.z;  //  Neighbor along screen x
+    adjy = curr - ddy(curr) * quad_vector.w;  //  Neighbor along screen y
+    diag = adjx - ddy(adjx) * quad_vector.w;  //  y-neighbor of the x-neighbor
+}
+
+void quad_gather(float4 quad_vector, float3 curr,
+    out float3 adjx, out float3 adjy, out float3 diag)
+{
+    //  Float3 overload; see the float4 quad_gather() for the full contract.
+    adjx = curr - ddx(curr) * quad_vector.z;  //  Neighbor along screen x
+    adjy = curr - ddy(curr) * quad_vector.w;  //  Neighbor along screen y
+    diag = adjx - ddy(adjx) * quad_vector.w;  //  y-neighbor of the x-neighbor
+}
+
+void quad_gather(float4 quad_vector, float2 curr,
+    out float2 adjx, out float2 adjy, out float2 diag)
+{
+    //  Float2 overload; see the float4 quad_gather() for the full contract.
+    adjx = curr - ddx(curr) * quad_vector.z;  //  Neighbor along screen x
+    adjy = curr - ddy(curr) * quad_vector.w;  //  Neighbor along screen y
+    diag = adjx - ddy(adjx) * quad_vector.w;  //  y-neighbor of the x-neighbor
+}
+
+float4 quad_gather(float4 quad_vector, float curr)
+{
+    //  Float version (packs all four quad values into one float4 result):
+    //  Returns:    return.x == current
+    //              return.y == adjacent x
+    //              return.z == adjacent y
+    //              return.w == diagonal
+    float4 all = float4(curr);
+    all.y = all.x - ddx(all.x) * quad_vector.z;  //  Fetch the x-neighbor first...
+    all.zw = all.xy - ddy(all.xy) * quad_vector.w;  //  ...then the y-neighbors of both
+    return all;
+}
+
+float4 quad_gather_sum(float4 quad_vector, float4 curr)
+{
+    //  Requires:   Same as quad_gather() (fine derivatives, valid quad_vector).
+    //  Returns:    Sum of an input vector (curr) at all 4 fragments in a quad.
+    float4 adjx, adjy, diag;
+    quad_gather(quad_vector, curr, adjx, adjy, diag);
+    return (curr + adjx + adjy + diag);
+}
+
+float3 quad_gather_sum(float4 quad_vector, float3 curr)
+{
+    //  Float3 overload: sum of curr over all 4 fragments in the quad.
+    float3 adjx, adjy, diag;
+    quad_gather(quad_vector, curr, adjx, adjy, diag);
+    return (curr + adjx + adjy + diag);
+}
+
+float2 quad_gather_sum(float4 quad_vector, float2 curr)
+{
+    //  Float2 overload: sum of curr over all 4 fragments in the quad.
+    float2 adjx, adjy, diag;
+    quad_gather(quad_vector, curr, adjx, adjy, diag);
+    return (curr + adjx + adjy + diag);
+}
+
+float quad_gather_sum(float4 quad_vector, float curr)
+{
+    //  Float overload: the scalar quad_gather() packs all 4 values in a float4.
+    float4 all_values = quad_gather(quad_vector, curr);
+    return (all_values.x + all_values.y + all_values.z + all_values.w);
+}
+
+bool fine_derivatives_working(float4 quad_vector, float4 curr)
+{
+    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //              2.) quad_vector describes the current fragment's location in
+    //                  its 2x2 pixel quad using get_quad_vector()'s conventions.
+    //              3.) curr must be a test vector with non-constant derivatives
+    //                  (its value should change nonlinearly across fragments).
+    //  Returns:    true if fine/hybrid/high-quality derivatives are used, or
+    //              false if coarse derivatives are used or it's inconclusive.
+    //  Usage:      Test whether quad-pixel communication is working!
+    //  Method:     We can confirm fine derivatives are used if the following
+    //              holds (ever, for any value at any fragment):
+    //                  (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
+    //              The more values we test (e.g. test a float4 two ways), the
+    //              easier it is to demonstrate fine derivatives are working.
+    //  TODO: Check for floating point exact comparison issues!
+    float4 ddx_curr = ddx(curr);
+    float4 ddy_curr = ddy(curr);
+    float4 adjx = curr - ddx_curr * quad_vector.z;  //  Value at the x-neighbor
+    float4 adjy = curr - ddy_curr * quad_vector.w;  //  Value at the y-neighbor
+    bool ddy_different = any(bool4(ddy_curr.x != ddy(adjx).x, ddy_curr.y != ddy(adjx).y, ddy_curr.z != ddy(adjx).z, ddy_curr.w != ddy(adjx).w));
+    bool ddx_different = any(bool4(ddx_curr.x != ddx(adjy).x, ddx_curr.y != ddx(adjy).y, ddx_curr.z != ddx(adjy).z, ddx_curr.w != ddx(adjy).w));
+    return any(bool2(ddy_different, ddx_different));
+}
+
+bool fine_derivatives_working_fast(float4 quad_vector, float curr)
+{
+    //  Requires:   Same as fine_derivatives_working()
+    //  Returns:    Same as fine_derivatives_working()
+    //  Usage:      This is faster than fine_derivatives_working() but more
+    //              likely to return false negatives, so it's less useful for
+    //              offline testing/debugging.  It's also useless as the basis
+    //              for dynamic runtime branching as of May 2014: Derivatives
+    //              (and quad-pixel communication) are currently disallowed in
+    //              branches.  However, future GPUs may allow you to use them
+    //              in dynamic branches if you promise the branch condition
+    //              evaluates the same for every fragment in the quad (and/or if
+    //              the driver enforces that promise by making a single fragment
+    //              control branch decisions).  If that ever happens, this
+    //              version may become a more economical choice.
+    float ddx_curr = ddx(curr);
+    float ddy_curr = ddy(curr);
+    float adjx = curr - ddx_curr * quad_vector.z;  //  Value at the x-neighbor
+    return (ddy_curr != ddy(adjx));  //  Only the one scalar ddy comparison is tested
+}
+
+#endif  //  QUAD_PIXEL_COMMUNICATION_H
+
+////////////////////////  END QUAD-PIXEL-COMMUNICATION  ///////////////////////
+
+//#include "special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Return an Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral_0^x(e**(-t**2) dt)
+    //              This approximation has a max absolute error of 2.5*10**-5
+    //              with solid numerical robustness and efficiency.  See:
+	//                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+	static const float4 one = float4(1.0);
+	const float4 sign_x = sign(x);  //  Polynomial uses abs(x); sign is reapplied below
+	const float4 t = one/(one + 0.47047*abs(x));
+	const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload: same approximation as the float4 erf6(), per component.
+	static const float3 one = float3(1.0);
+	const float3 sign_x = sign(x);
+	const float3 t = one/(one + 0.47047*abs(x));
+	const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload: same approximation as the float4 erf6(), per component.
+	static const float2 one = float2(1.0);
+	const float2 sign_x = sign(x);
+	const float2 t = one/(one + 0.47047*abs(x));
+	const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload: same approximation as the float4 erf6().
+	const float sign_x = sign(x);
+	const float t = 1.0/(1.0 + 0.47047*abs(x));
+	const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Approximate erf() with the hyperbolic tangent.  The error is
+    //              visually noticeable, but it's blazing fast and perceptually
+    //              close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly implement
+    //              tanh(): My nVidia 8800GTS returns garbage output.
+	return tanh(1.202760580 * x);  //  Scale constant from the tanh fit referenced above
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 overload of the tanh-based erf(); see the float4 erft() warning.
+	return tanh(1.202760580 * x);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 overload of the tanh-based erf(); see the float4 erft() warning.
+	return tanh(1.202760580 * x);
+}
+
+float erft(const float x)
+{
+    //  Scalar overload of the tanh-based erf(); see the float4 erft() warning.
+	return tanh(1.202760580 * x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 overload: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 overload: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  This implementation function requires
+    //                  the caller to precompute this value, giving users the
+    //                  opportunity to reuse it.
+    //  Returns:    Return approximate gamma function (real-numbered factorial)
+    //              output using the Lanczos approximation with two coefficients
+    //              calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally-spaced
+    //              evals.  We could use three coeffs (0.0000346 error) without
+    //              hurting latency, but this allows more parallelism with
+    //              outside instructions.
+	static const float4 g = float4(1.12906830989);
+	static const float4 c0 = float4(0.8109119309638332633713423362694399653724431);
+	static const float4 c1 = float4(0.4808354605142681877121661197951496120000040);
+	static const float4 e = float4(2.71828182845904523536028747135266249775724709);
+	const float4 sph = s + float4(0.5);  //  "s plus half"
+	const float4 lanczos_sum = c0 + c1/(s + float4(1.0));
+	const float4 base = (sph + g)/e;  //  or (s + g + float4(0.5))/e
+	//  gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
+	//  This has less error for small s's than (s -= 1.0) at the beginning.
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 overload; see the float4 gamma_impl() for the contract/derivation.
+	static const float3 g = float3(1.12906830989);
+	static const float3 c0 = float3(0.8109119309638332633713423362694399653724431);
+	static const float3 c1 = float3(0.4808354605142681877121661197951496120000040);
+	static const float3 e = float3(2.71828182845904523536028747135266249775724709);
+	const float3 sph = s + float3(0.5);
+	const float3 lanczos_sum = c0 + c1/(s + float3(1.0));
+	const float3 base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;  //  gamma(s + 1) * (1/s)
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 overload; see the float4 gamma_impl() for the contract/derivation.
+	static const float2 g = float2(1.12906830989);
+	static const float2 c0 = float2(0.8109119309638332633713423362694399653724431);
+	static const float2 c1 = float2(0.4808354605142681877121661197951496120000040);
+	static const float2 e = float2(2.71828182845904523536028747135266249775724709);
+	const float2 sph = s + float2(0.5);
+	const float2 lanczos_sum = c0 + c1/(s + float2(1.0));
+	const float2 base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;  //  gamma(s + 1) * (1/s)
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload; see the float4 gamma_impl() for the contract/derivation.
+	static const float g = 1.12906830989;
+	static const float c0 = 0.8109119309638332633713423362694399653724431;
+	static const float c1 = 0.4808354605142681877121661197951496120000040;
+	static const float e = 2.71828182845904523536028747135266249775724709;
+	const float sph = s + 0.5;
+	const float lanczos_sum = c0 + c1/(s + 1.0);
+	const float base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;  //  gamma(s + 1) * (1/s)
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard parameter to the gamma function, and it
+    //              should lie in the [0, 36] range.
+    //  Returns:    Return approximate gamma function output with a maximum
+    //              relative error of 0.000463.  See gamma_impl() for details.
+	return gamma_impl(s, float4(1.0)/s);  //  Computes the 1/s that gamma_impl() expects
+}
+
+float3 gamma(const float3 s)
+{
+    //  Float3 overload: computes 1/s and forwards to gamma_impl().
+	return gamma_impl(s, float3(1.0)/s);
+}
+
+float2 gamma(const float2 s)
+{
+    //  Float2 overload: computes 1/s and forwards to gamma_impl().
+	return gamma_impl(s, float2(1.0)/s);
+}
+
+float gamma(const float s)
+{
+    //  Scalar overload: computes 1/s and forwards to gamma_impl().
+	return gamma_impl(s, 1.0/s);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The actual "rolled up" summation looks like:
+	//      last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
+	//      sum = last_sign * last_pow / (s * last_factorial);  //  k == 0 term
+	//      for(int k = 1; k < 4; ++k)
+	//      {
+	//          last_sign *= -1.0; last_pow *= z; last_factorial *= k;
+	//          sum += last_sign * last_pow / ((s + k) * last_factorial);
+	//      }
+	//  Unrolled, constant-unfolded and arranged for madds and parallelism:
+	const float4 scale = pow(z, s);
+	float4 sum = s_inv;  //  Summation iteration 0 result
+	//  Summation iterations 1, 2, and 3:
+	const float4 z_sq = z*z;
+	const float4 denom1 = s + float4(1.0);  //  (s + 1) * 1!
+	const float4 denom2 = 2.0*s + float4(4.0);  //  (s + 2) * 2!
+	const float4 denom3 = 6.0*s + float4(18.0);  //  (s + 3) * 3!
+	//float4 denom4 = 24.0*s + float4(96.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	//sum += z_sq * z_sq / denom4;
+	//  Scale and return:
+	return scale * sum;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 overload; see the float4 ligamma_small_z_impl() for the series.
+	const float3 scale = pow(z, s);
+	float3 sum = s_inv;  //  Series term 0
+	const float3 z_sq = z*z;
+	const float3 denom1 = s + float3(1.0);
+	const float3 denom2 = 2.0*s + float3(4.0);
+	const float3 denom3 = 6.0*s + float3(18.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 overload; see the float4 ligamma_small_z_impl() for the series.
+	const float2 scale = pow(z, s);
+	float2 sum = s_inv;  //  Series term 0
+	const float2 z_sq = z*z;
+	const float2 denom1 = s + float2(1.0);
+	const float2 denom2 = 2.0*s + float2(4.0);
+	const float2 denom3 = 6.0*s + float2(18.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload; see the float4 ligamma_small_z_impl() for the series.
+	const float scale = pow(z, s);
+	float sum = s_inv;  //  Series term 0
+	const float z_sq = z*z;
+	const float denom1 = s + 1.0;
+	const float denom2 = 2.0*s + 4.0;
+	const float denom3 = 6.0*s + 18.0;
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+	//  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up":
+	//      denom = float4('inf');
+	//      float4 one = float4(1.0);
+	//      for(int i = 4; i > 0; --i)
+	//      {
+	//          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+	//      }
+	//  Unrolled and constant-unfolded for madds and parallelism:
+	const float4 numerator = pow(z, s) * exp(-z);
+	float4 denom = float4(7.0) + z - s;  //  i == 4 (the x/inf term vanishes)
+	denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom;  //  i == 3
+	denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom;  //  i == 2
+	denom = float4(1.0) + z - s + (s - float4(1.0))/denom;  //  i == 1
+	return numerator / denom;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 overload; see the float4 uigamma_large_z_impl() for the fraction.
+	const float3 numerator = pow(z, s) * exp(-z);
+	float3 denom = float3(7.0) + z - s;
+	denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom;
+	denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom;
+	denom = float3(1.0) + z - s + (s - float3(1.0))/denom;
+	return numerator / denom;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 overload; see the float4 uigamma_large_z_impl() for the fraction.
+	const float2 numerator = pow(z, s) * exp(-z);
+	float2 denom = float2(7.0) + z - s;
+	denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom;
+	denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom;
+	denom = float2(1.0) + z - s + (s - float2(1.0))/denom;
+	return numerator / denom;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload; see the float4 uigamma_large_z_impl() for the fraction.
+	const float numerator = pow(z, s) * exp(-z);
+	float denom = 7.0 + z - s;
+	denom = 5.0 + z - s + (3.0*s - 9.0)/denom;
+	denom = 3.0 + z - s + (2.0*s - 4.0)/denom;
+	denom = 1.0 + z - s + (s - 1.0)/denom;
+	return numerator / denom;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  Since we only care about s < 0.5, we only need
+    //              to evaluate two branches (not four) based on z.  Each branch
+    //              uses four terms, with a max relative error of ~0.00182.  The
+    //              branch threshold and specifics were adapted for fewer terms
+    //              from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+	//  Evaluate both branches: Real branches test slower even when available.
+	static const float4 thresh = float4(0.775075);
+	bool4 z_is_large;
+	z_is_large.x = z.x > thresh.x;
+	z_is_large.y = z.y > thresh.y;
+	z_is_large.z = z.z > thresh.z;
+	z_is_large.w = z.w > thresh.w;
+	const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;  //  1 - normalized upper
+	const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	//  Combine the results from both branches (per-component 0.0/1.0 masks):
+	bool4 inverse_z_is_large = not(z_is_large);
+	return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  Float3 overload; see the float4 normalized_ligamma_impl() for details.
+	static const float3 thresh = float3(0.775075);
+	bool3 z_is_large;
+	z_is_large.x = z.x > thresh.x;
+	z_is_large.y = z.y > thresh.y;
+	z_is_large.z = z.z > thresh.z;
+	const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool3 inverse_z_is_large = not(z_is_large);
+	return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large);  //  Mask-select per component
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  Float2 overload; see the float4 normalized_ligamma_impl() for details.
+	static const float2 thresh = float2(0.775075);
+	bool2 z_is_large;
+	z_is_large.x = z.x > thresh.x;
+	z_is_large.y = z.y > thresh.y;
+	const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool2 inverse_z_is_large = not(z_is_large);
+	return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large);  //  Mask-select per component
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload; see the float4 normalized_ligamma_impl() for details.
+	static const float thresh = 0.775075;
+	const bool z_is_large = z > thresh;
+	const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	return large_z * float(z_is_large) + small_z * float(!z_is_large);  //  Both evaluated; 0.0/1.0 masks pick one
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  See normalized_ligamma_impl() for details.
+	const float4 s_inv = float4(1.0)/s;
+	const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv);  //  1/gamma(s)
+	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  Float3 overload: precomputes 1/s and 1/gamma(s), then calls the _impl.
+	const float3 s_inv = float3(1.0)/s;
+	const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv);
+	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  Float2 overload: precomputes 1/s and 1/gamma(s), then calls the _impl.
+	const float2 s_inv = float2(1.0)/s;
+	const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv);
+	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload: precomputes 1/s and 1/gamma(s), then calls the _impl.
+	const float s_inv = 1.0/s;
+	const float gamma_s_inv = 1.0/gamma_impl(s, s_inv);
+	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 uv2_to_uv4(float2 tex_uv)
+{
+    //  Pad a float2 uv offset with zero .zw so it adds safely to float4 tex2Dlod coords:
+    return float4(tex_uv, 0.0, 0.0);
+}
+
+//  Squared-length helper macro (for static constants); note vec is expanded twice:
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+inline float get_fast_gaussian_weight_sum_inv(const float sigma)
+{
+    //  We can use the Gaussian integral to calculate the asymptotic weight for
+    //  the center pixel.  Since the unnormalized center pixel weight is 1.0,
+    //  the normalized weight is the same as the weight sum inverse.  Given a
+    //  large enough blur (9+), the asymptotic weight sum is close and faster:
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //      erf(-x) == -erf(x), so we get 0.5 * (2.0 * erf(blah blah)):
+    //  However, we can get even faster results with curve-fitting.  These are
+    //  also closer than the asymptotic results, because they were constructed
+    //  from 64 blur sizes from [3, 131) and 255 equally-spaced sigmas from
+    //  (0, blurN_std_dev), so the results for smaller sigmas are biased toward
+    //  smaller blurs.  The max error is 0.0031793913.
+    //  Relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    //static const float temp = 0.5/sqrt(2.0);
+    //return erf(temp/sigma);
+    return min(exp(exp(0.348348412457428/
+        (sigma - 0.0860587260734721))), 0.399334576340352/sigma);  //  Smaller of the two fitted curves
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup using an 11-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Calculate Gaussian blur kernel weights and a normalization factor for
+    //  distances of 0-5, ignoring constant factors (since we're normalizing).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float weight_sum_inv = 1.0 /
+        (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Sum the weighted samples, then normalize once at the end.  Blurs are
+    //  currently optimized for dynamic weights.
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    sum += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    sum += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    sum += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    sum += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 9-tap 1D Gaussian blur of tex centered on tex_uv, with
+    //              taps dxdy apart.  It may be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for tap distances 0-4 and their inverse sum:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float g4 = exp(-16.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Accumulate the weighted taps across the kernel, then normalize once:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += g4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    accum += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    accum += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    accum += g4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return accum * inv_weight_total;
+}
+
+float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 7-tap 1D Gaussian blur of tex centered on tex_uv, with
+    //              taps dxdy apart.  It may be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for tap distances 0-3 and their inverse sum:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Accumulate the weighted taps across the kernel, then normalize once:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    accum += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return accum * inv_weight_total;
+}
+
+float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 5-tap 1D Gaussian blur of tex centered on tex_uv, with
+    //              taps dxdy apart.  It may be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for tap distances 0-2 and their inverse sum:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Accumulate the weighted taps across the kernel, then normalize once:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return accum * inv_weight_total;
+}
+
+float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3-tap 1D Gaussian blur of tex centered on tex_uv, with
+    //              taps dxdy apart.  It may be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for tap distances 0-1 and their inverse sum:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (g0 + 2.0 * g1);
+    //  Accumulate the weighted taps across the kernel, then normalize once:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    return accum * inv_weight_total;
+}
+
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    An 11-texel 1D Gaussian blur of tex around tex_uv built from
+    //              6 linear taps.  It may be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for texel distances 0-5 and their inverse sum:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float g4 = exp(-16.0 * inv_two_var);
+    const float g5 = exp(-25.0 * inv_two_var);
+    const float inv_weight_total = 1.0 /
+        (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5));
+    //  Fold neighboring texels into pairs: each pair gets a combined weight and
+    //  a sampling shift toward its farther texel.  g0 is shared by two pairs.
+    const float pair01 = g0 * 0.5 + g1;
+    const float pair23 = g2 + g3;
+    const float pair45 = g4 + g5;
+    const float shift01 = g1/pair01;
+    const float shift23 = g3/pair23;
+    const float shift45 = g5/pair45;
+    //  Accumulate the six pair taps across the kernel, then normalize once:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += pair45 * tex2D_linearize(tex, tex_uv - (4.0 + shift45) * dxdy).rgb;
+    accum += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + shift23) * dxdy).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv - shift01 * dxdy).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv + shift01 * dxdy).rgb;
+    accum += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + shift23) * dxdy).rgb;
+    accum += pair45 * tex2D_linearize(tex, tex_uv + (4.0 + shift45) * dxdy).rgb;
+    return accum * inv_weight_total;
+}
+
+float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 9-texel 1D Gaussian blur of tex around tex_uv built from
+    //              1 nearest neighbor tap and 4 linear taps.  It may be
+    //              mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-4 and their inverse sum:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float g4 = exp(-16.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Fold texels 1-2 and 3-4 into pairs (combined weight + shift toward farther texel):
+    const float pair12 = g1 + g2;
+    const float pair34 = g3 + g4;
+    const float shift12 = g2/pair12;
+    const float shift34 = g4/pair34;
+    //  Accumulate the center tap and four pair taps, then normalize once:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += pair34 * tex2D_linearize(tex, tex_uv - (3.0 + shift34) * dxdy).rgb;
+    accum += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + shift12) * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + shift12) * dxdy).rgb;
+    accum += pair34 * tex2D_linearize(tex, tex_uv + (3.0 + shift34) * dxdy).rgb;
+    return accum * inv_weight_total;
+}
+
+float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 7-texel 1D Gaussian blur of tex around tex_uv built from
+    //              4 linear taps.  It may be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for texel distances 0-3 and their inverse sum:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Fold neighboring texels into pairs: each pair gets a combined weight and
+    //  a sampling shift toward its farther texel.  g0 is shared by two pairs.
+    const float pair01 = g0 * 0.5 + g1;
+    const float pair23 = g2 + g3;
+    const float shift01 = g1/pair01;
+    const float shift23 = g3/pair23;
+    //  Accumulate the four pair taps across the kernel, then normalize once:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + shift23) * dxdy).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv - shift01 * dxdy).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv + shift01 * dxdy).rgb;
+    accum += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + shift23) * dxdy).rgb;
+    return accum * inv_weight_total;
+}
+
+float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 5-texel 1D Gaussian blur of tex around tex_uv built from
+    //              1 nearest neighbor tap and 2 linear taps.  It may be
+    //              mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-2 and their inverse sum:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Fold texels 1-2 into one pair (combined weight + shift toward farther texel):
+    const float pair12 = g1 + g2;
+    const float shift12 = g2/pair12;
+    //  Accumulate the center tap and two pair taps, then normalize once:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + shift12) * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + shift12) * dxdy).rgb;
+    return accum * inv_weight_total;
+}
+
+float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Get the unnormalized texel weights; no normalization factor is needed.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    //  Calculate combined weights and linear sample ratios between texel pairs.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w01 = w0 * 0.5 + w1;
+    const float w01_ratio = w1/w01;
+    //  Both taps carry weight w01, and 2*w01 == w0 + 2*w1 (the full weight
+    //  sum), so each normalized weight is exactly 0.5; just average them:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+
+////////////////////////////  HUGE SEPARABLE BLURS  ////////////////////////////
+
+//  Huge separable blurs come only in "fast" versions.
+float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Compute unnormalized Gaussian texel weights for distances 0-21 (w0-w21):
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    const float w16 = exp(-256.0 * denom_inv);
+    const float w17 = exp(-289.0 * denom_inv);
+    const float w18 = exp(-324.0 * denom_inv);
+    const float w19 = exp(-361.0 * denom_inv);
+    const float w20 = exp(-400.0 * denom_inv);
+    const float w21 = exp(-441.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 +
+    //        w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); //  used instead of the exact sum above (presumably a faster approximation)
+    //  Pair adjacent texels: a pair's weight is the sum of both, and its ratio
+    //  is the farther texel's share.  w0 is split in half across both center pairs.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w16_17 = w16 + w17;
+    const float w18_19 = w18 + w19;
+    const float w20_21 = w20 + w21;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    const float w16_17_ratio = w17/w16_17;
+    const float w18_19_ratio = w19/w18_19;
+    const float w20_21_ratio = w21/w20_21;
+    //  Sum the 22 pair taps (offset = nearer texel + ratio, in dxdy units), then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 31x Gaussian blurred texture lookup using 16 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Compute unnormalized Gaussian texel weights for distances 0-15 (w0-w15):
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 +
+    //        w9 + w10 + w11 + w12 + w13 + w14 + w15));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); //  used instead of the exact sum above (presumably a faster approximation)
+    //  Pair adjacent texels: a pair's weight is the sum of both, and its ratio
+    //  is the farther texel's share.  w0 is split in half across both center pairs.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    //  Sum the 16 pair taps (offset = nearer texel + ratio, in dxdy units), then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 25x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 12 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Compute unnormalized Gaussian texel weights for distances 0-12 (w0-w12):
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); //  used instead of the exact sum above (presumably a faster approximation)
+    //  Pair adjacent texels (1-2, 3-4, ...): weight = sum of both, ratio = farther texel's share.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w9_10 = w9 + w10;
+    const float w11_12 = w11 + w12;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    const float w9_10_ratio = w10/w9_10;
+    const float w11_12_ratio = w12/w11_12;
+    //  Sum the center tap and 12 pair taps (offset = nearer texel + ratio, in dxdy units), then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 17x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 8 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Compute unnormalized Gaussian texel weights for distances 0-8 (w0-w8):
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); //  used instead of the exact sum above (presumably a faster approximation)
+    //  Pair adjacent texels (1-2, 3-4, ...): weight = sum of both, ratio = farther texel's share.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    //  Sum the center tap and 8 pair taps (offset = nearer texel + ratio, in dxdy units), then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input.
+    //  Description:
+    //  The sole arbitrarily resizable one-pass blur: a tex2Dblur5x5resize
+    //  would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize.
+    const float inv_two_var = 0.5/(sigma*sigma);
+    //  Fetch the full 3x3 neighborhood, row by row.  Quad-pixel communication
+    //  wouldn't help: This should perform like tex2Dblur5x5, but sharing a
+    //  4x4 sample field would perform more like tex2Dblur8x8shared (worse).
+    const float2 mid_row_uv = tex_uv;
+    const float2 step_x = float2(dxdy.x, 0.0);
+    const float2 step_y = float2(0.0, dxdy.y);
+    const float2 prev_row_uv = mid_row_uv - step_y;
+    const float2 next_row_uv = mid_row_uv + step_y;
+    const float3 tap00 = tex2D_linearize(tex, prev_row_uv - step_x).rgb;
+    const float3 tap01 = tex2D_linearize(tex, prev_row_uv).rgb;
+    const float3 tap02 = tex2D_linearize(tex, prev_row_uv + step_x).rgb;
+    const float3 tap10 = tex2D_linearize(tex, mid_row_uv - step_x).rgb;
+    const float3 tap11 = tex2D_linearize(tex, mid_row_uv).rgb;
+    const float3 tap12 = tex2D_linearize(tex, mid_row_uv + step_x).rgb;
+    const float3 tap20 = tex2D_linearize(tex, next_row_uv - step_x).rgb;
+    const float3 tap21 = tex2D_linearize(tex, next_row_uv).rgb;
+    const float3 tap22 = tex2D_linearize(tex, next_row_uv + step_x).rgb;
+    //  Statically compute Gaussian weights for the center, edge, and corner taps:
+    const float w_center = 1.0;
+    const float w_edge = exp(-LENGTH_SQ(float2(1.0, 0.0)) * inv_two_var);
+    const float w_corner = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_two_var);
+    const float inv_weight_total = 1.0/(w_center + 4.0 * (w_edge + w_corner));
+    //  Combine the taps by weight class, then normalize once:
+    const float3 accum = w_center * tap11 +
+        w_edge * (tap01 + tap10 + tap12 + tap21) +
+        w_corner * (tap00 + tap02 + tap20 + tap22);
+    return accum * inv_weight_total;
+}
+
+
+////////////////////////////  FASTER ONE-PASS BLURS  ///////////////////////////
+
+float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.  Adjust the
+    //  bilinear sample location to reflect the true Gaussian weights for each
+    //  underlying texel.  The following diagram illustrates the relative
+    //  locations of bilinear samples.  Each sample with the same number has the
+    //  same weight (notice the symmetry).  The letters a, b, c, d distinguish
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float w4off = exp(-16.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    const float texel3to4ratio = w4off/(w3off + w4off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0);
+    const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+    const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio);
+    const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio);
+    const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2R1 = w3off;
+    const float w2R2 = w4off;
+    const float w3d1 =     exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w3d4 =     exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv);
+    const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv);
+    const float w6d1 =     exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv);
+    const float w6d4 =     exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2R1 + w2R2;
+    const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    const float w5 = w4;
+    const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear:
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
+    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2), the Gaussian exponent scale
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);  //  texel 0 is shared by two samples per axis, so halve its weight
+    const float texel2to3ratio = w3off/(w2off + w3off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+    const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1abcd = 1.0;  //  center texel 1*, shared by samples 1a/1b/1c/1d
+    const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv);
+    const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv);  //  NOTE(review): the diagram names the second texel 3cd3, not 3cd2
+    const float w1d4 =       exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d3_3d2 =   exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4_3d4 =   exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d1 =       exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d2_4d3 =   exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4 =       exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights.
+    //  Split weights for shared texels between samples sharing them:
+    const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;  //  center/4 + two half-shared texels (0.5 each) + 1d4
+    const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;  //  samples 2 and 3 mirror each other across the diagonal
+    const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;  //  no shared texels in the corner sample
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;  //  negated x step: quadrant c
+    const float2 dxdy_mirror_y = dxdy * mirror_y;  //  negated y step: quadrant b
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;  //  both steps negated: quadrant a
+    const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by the inverse weight sum (so total = 1.0).
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
+    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
+    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
+    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  First see the description for tex2Dblur9x9().  This blur uses the same
+    //  concept and sample/texel locations except on a smaller scale.  Samples:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2), the Gaussian exponent scale
+    const float w1off = exp(-1.0 * denom_inv);  //  1D weight of the texel 1 texel from center
+    const float w2off = exp(-4.0 * denom_inv);  //  1D weight of the texel 2 texels from center
+    const float texel1to2ratio = w2off/(w1off + w2off);  //  fractional position between texels 1 and 2
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2d1 =   exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);  //  texels 2d2 and 2d3 are equidistant from center
+    const float w2d4 =   exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2));  //  1 center + 4 axis samples + 4 diagonal samples
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;  //  negated x step: quadrant c
+    const float2 dxdy_mirror_y = dxdy * mirror_y;  //  negated y step: quadrant b
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;  //  both steps negated: quadrant a
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;  //  .yx swaps the offset onto the y axis
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by the inverse weight sum (so total = 1.0).
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2a + sample2b + sample2c + sample2d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed of
+    //              2x2 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      0a 0b
+    //      0c 0d
+    //  The texel layout is as follows.  Note that samples 0a/0b and 0c/0d share
+    //  a vertical column of texels, and samples 0a/0c and 0b/0d share a
+    //  horizontal row of texels (all samples share the center texel):
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2), the Gaussian exponent scale
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);  //  texel 0 is shared by two samples per axis, so halve its weight
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 4 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;  //  negated x step: sample 0c
+    const float2 dxdy_mirror_y = dxdy * mirror_y;  //  negated y step: sample 0b
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;  //  both steps negated: sample 0a
+    const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb;
+    const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb;
+    const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb;
+    const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Weights for all samples are the same, so just average them:
+    return 0.25 * (sample0a + sample0b + sample0c + sample0d);
+}
+
+
+//////////////////  LINEAR ONE-PASS BLURS WITH SHARED SAMPLES  /////////////////
+
+float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   1.) Same as tex2Dblur9()
+    //              2.) ddx() and ddy() are present in the current Cg profile.
+    //              3.) The GPU driver is using fine/high-quality derivatives.
+    //              4.) quad_vector *correctly* describes the current fragment's
+    //                  location in its pixel quad, by the conventions noted in
+    //                  get_quad_vector[_naive].
+    //              5.) tex_uv.w = log2(video_size/output_size).y
+    //              6.) tex2Dlod() is present in the current Cg profile.
+    //  Optional:   Tune artifacts vs. excessive blurriness with the global
+    //              float error_blurring.
+    //  Returns:    A blurred texture lookup using a "virtual" 12x12 Gaussian
+    //              blur (a 6x6 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  Perform a 1-pass blur with shared texture lookups across a pixel quad.
+    //  We'll get neighboring samples with high-quality ddx/ddy derivatives, as
+    //  in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
+    //  Message Passing" by Eric Penner.
+    //
+    //  Our "virtual" 12x12 blur will be comprised of (6^2)/4 + 3 = 12
+    //  bilinear samples, where bilinear sampling positions are computed from
+    //  the relative Gaussian weights of the 4 surrounding texels.  The catch is
+    //  that the appropriate texel weights and sample coords differ for each
+    //  fragment, but we're reusing most of the same samples across a quad of
+    //  destination fragments.  (We do use unique coords for the four nearest
+    //  samples at each fragment.)  Mixing bilinear filtering and sample-sharing
+    //  therefore introduces some error into the weights, and this can get nasty
+    //  when the source image is small or high-frequency.  Computing bilinear
+    //  ratios based on weights at the sample field center results in sharpening
+    //  and ringing artifacts, but we can move samples closer to halfway between
+    //  texels to try blurring away the error (which can move features around by
+    //  a texel or so).  Tune this with the global float "error_blurring".
+    //
+    //  The pixel quad's sample field covers 12x12 texels, accessed through 6x6
+    //  bilinear (2x2 texel) taps.  Each fragment depends on a window of 10x10
+    //  texels (5x5 bilinear taps), and each fragment is responsible for loading
+    //  a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps
+    //  to use unique bilinear coords for sample0* for each fragment.  This
+    //  diagram illustrates the relative locations of bilinear samples 1-9 for
+    //  each quadrant a, b, c, d (note samples will not be equally spaced):
+    //      8a 7a 6a 6b 7b 8b
+    //      5a 4a 3a 3b 4b 5b
+    //      2a 1a 0a 0b 1b 2b
+    //      2c 1c 0c 0d 1d 2d
+    //      5c 4c 3c 3d 4d 5d
+    //      8c 7c 6c 6d 7d 8d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2 texel block:
+    //      8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
+    //      8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
+    //      5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
+    //      5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
+    //      2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
+    //      2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
+    //      2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
+    //      2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
+    //      5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
+    //      5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
+    //      8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
+    //      8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
+    //  With this symmetric arrangement, we don't have to know which absolute
+    //  quadrant a sample lies in to assign kernel weights; it's enough to know
+    //  the sample number and the relative quadrant of the sample (relative to
+    //  the current quadrant):
+    //      {current, adjacent x, adjacent y, diagonal}
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
+    //  and [4, 5] away from the fragment, and reuse them independently for both
+    //  dimensions.  Use the sample field center as the estimated destination,
+    //  but nudge the result closer to halfway between texels to blur error.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  based on the sum of their 4 underlying texel weights.  Assume a same-
+    //  resolution blur, so each symmetrically named sample weight will compute
+    //  the same at every fragment in the pixel quad: We can therefore compute
+    //  texel weights based only on the bottom-right quadrant (fragment at 0d0).
+    //  To avoid too much boilerplate code, use a macro to get all 4 texel
+    //  weights for a bilinear sample based on the offset of its top-left texel:
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
+    const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
+    const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
+    const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
+    const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
+    const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
+    const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
+    const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
+    const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
+    const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    float3 sample8adjx, sample8adjy, sample8diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
+    sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
+    sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
+    sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 10x10 Gaussian
+    //              blur (a 5x5 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 25 of the 36 samples taken across the pixel quad (to cover a
+    //  5x5 sample area, or 10x10 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 11 omitted samples
+    //  are always the "same:"
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 25 of the 36 sample weights.  Skip the following weights:
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w4curr + w5curr + w6curr + w7curr + w8curr +
+        w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
+        w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
+        w0diag + w1diag + w3diag + w4diag);
+    //  Statically pack most weights for runtime (w8curr stays scalar).  Note the mixed packing of w2and5 (curr/adjy pairs) and w6and7 (curr/adjx pairs):
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);
+    const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad in order of need (sample8 is only used unshared, so it is never gathered):
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    //  Sum the weighted samples; normalization is deferred to the final multiply
+    //  by weight_sum_inv.  Fill each row of a matrix with an rgb sample and
+    //  pre-multiply by the weights to obtain a weighted result.  Simple ones first:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    //  Now do the mixed-sample ones:
+    sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
+    sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
+    sum += w8curr * sample8curr;
+    //  Normalize the sum (so the weights add to 1.0) and return:
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 8x8 Gaussian
+    //              blur (a 4x4 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This function
+    //  shares the same concept and a similar sample placement, except each
+    //  quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
+    //  respectively.  There could be a total of 16 samples, 4 of which each
+    //  fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
+    //  its own offset to reduce shared sample artifacts, bringing the sample
+    //  count for each fragment to 7.  Sample placement:
+    //      3a 2a 2b 3b
+    //      1a 0a 0b 1b
+    //      1c 0c 0d 1d
+    //      3c 2c 2d 3d
+    //  Texel placement:
+    //      3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
+    //      3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
+    //      1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
+    //      1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
+    //      1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
+    //      1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
+    //      3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
+    //      3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
+    
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    //  Sum the weighted samples; normalization is deferred to the final multiply
+    //  by weight_sum_inv.  Fill each row of a matrix with an rgb sample and
+    //  pre-multiply by the weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 6x6 Gaussian
+    //              blur (a 3x3 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur8x8shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 9 of the 16 samples taken across the pixel quad (to cover a
+    //  3x3 sample area, or 6x6 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 7 omitted samples
+    //  are always the "same:"
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 9 of the 16 sample weights.  Skip the following weights:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
+    //  Statically pack some weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the shared samples from other fragments in the 2x2 quad (sample3 is only used unshared, so it is never gathered):
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    //  Sum the weighted samples; normalization is deferred to the final multiply
+    //  by weight_sum_inv.  Fill each row of a matrix with an rgb sample and
+    //  pre-multiply by the weights to obtain a weighted result for sample0*, and
+    //  handle the rest of the weights more directly/verbosely:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
+            w2adjx * sample2adjx + w3curr * sample3curr;
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  MAX OPTIMAL SIGMA BLUR WRAPPERS  //////////////////////
+
+//  The following blurs are static wrappers around the dynamic blurs above.
+//  HOPEFULLY, the compiler will be smart enough to do constant-folding.
+
+//  Resizable separable blurs (each forwards to the same-named overload that takes one extra trailing argument, passing its blurN_std_dev constant):
+inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Fast separable blurs (each forwards to the same-named overload that takes one extra trailing argument, passing its blurN_std_dev constant):
+inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Huge, "fast" separable blurs (each forwards to the same-named overload that takes one extra trailing argument, passing its blurN_std_dev constant):
+inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+//  Resizable one-pass blurs (forwards to the same-named overload that takes one extra trailing argument, passing blur3_std_dev):
+inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" one-pass blurs (each forwards to the same-named overload that takes one extra trailing argument, passing its blurN_std_dev constant):
+inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" shared-sample one-pass blurs:
+inline float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev);
+}
+inline float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev);
+}
+inline float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{   //  Short overload: supplies blur8_std_dev as the last argument.
+    return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev);
+}
+inline float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{   //  Short overload: supplies blur6_std_dev as the last argument.
+    return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev);
+}
+
+
+#endif  //  BLUR_FUNCTIONS_H
+
+////////////////////////////  END BLUR-FUNCTIONS  ///////////////////////////
+
+///////////////////////////////  BLOOM CONSTANTS  //////////////////////////////
+
+//  Compute constants with manual inlines of the functions below:
+static const float bloom_diff_thresh = 1.0/256.0;  //  passed as "thresh" to get_min_sigma_to_blur_triad() below
+
+
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Smallest Gaussian sigma that blurs one phosphor triad to an even
+    //  color, to within thresh.
+    //  triad_size: final on-screen phosphor triad size, in pixels.
+    //  thresh:     max pixel difference tolerated in the blurred triad
+    //              (e.g. 1.0/256.0).
+    //  The closed form is a curve fit to data rather than a derivation.
+    //  Coarser fit kept for reference (max error ~0.16486, mean sq. error
+    //  ~0.0041041):  0.5985*triad_size - triad_size*sqrt(thresh)
+    //  Fit used here: max error ~0.086036, mean sq. error ~0.0013387.
+    return 0.6113*triad_size - 0.05168 -
+        1.122*triad_size*sqrt(thresh + 0.000416);
+}
+
+inline float get_absolute_scale_blur_sigma(const float thresh)
+{
+    //  Purpose:
+    //      Pick the Gaussian sigma for blurring the BLOOM_APPROX output as
+    //      though min_allowed_viewport_triads.x phosphor triads were being
+    //      blurred away, rescaled to an output bloom_approx_scale_x wide.
+    //  Parameter:
+    //      thresh: max blurred-triad pixel difference (e.g. 1.0/256.0).
+    //  Globals read:
+    //      bloom_approx_scale_x: absolute horizontal scale of BLOOM_APPROX.
+    //      min_allowed_viewport_triads.x: the final image needs at least
+    //              this many horizontal triads for realistic results.
+    //      max_viewport_size_x: an extremely large stand-in viewport width,
+    //              used to get the asymptotic sigma.
+    //  Constraint:
+    //      bloom_approx_scale_x/min_allowed_viewport_triads.x should stay
+    //      <= 1.1658025090 so the result stays < 0.62666015625, the largest
+    //      sigma keeping the largest unused texel weight < 1.0/256.0 for a
+    //      3x3 blur.
+    //  Rationale:
+    //      BLOOM_APPROX should resemble a downscaled phosphor blur, but the
+    //      viewport size is unknown in this pass, so the real bloom sigma
+    //      can't simply be scaled by bloom_approx_scale_x/viewport_size_x.
+    //      Assuming the minimum triad count overblurs when more triads are
+    //      in use; that's tolerable, since blurring a constant fraction of
+    //      the viewport may resemble a true optical bloom anyway (the
+    //      viewport fills a similar share of the field of view regardless
+    //      of screen size and resolution).
+    return get_min_sigma_to_blur_triad(
+        max_viewport_size_x/min_allowed_viewport_triads.x, thresh) *
+        (bloom_approx_scale_x/max_viewport_size_x);
+}
+
+inline float get_center_weight(const float sigma)
+{
+    //  Given a Gaussian blur sigma, get the blur weight for the center texel.
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        return get_fast_gaussian_weight_sum_inv(sigma);  //  NOTE(review): not squared here, unlike the #else path -- confirm this is intended
+    #else
+        const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2)
+        const float w0 = 1.0;  //  wN = exp(-N*N/(2*sigma^2)): unnormalized Gaussian at distance N
+        const float w1 = exp(-1.0 * denom_inv);
+        const float w2 = exp(-4.0 * denom_inv);
+        const float w3 = exp(-9.0 * denom_inv);
+        const float w4 = exp(-16.0 * denom_inv);
+        const float w5 = exp(-25.0 * denom_inv);
+        const float w6 = exp(-36.0 * denom_inv);
+        const float w7 = exp(-49.0 * denom_inv);
+        const float w8 = exp(-64.0 * denom_inv);
+        const float w9 = exp(-81.0 * denom_inv);
+        const float w10 = exp(-100.0 * denom_inv);
+        const float w11 = exp(-121.0 * denom_inv);
+        const float w12 = exp(-144.0 * denom_inv);
+        const float w13 = exp(-169.0 * denom_inv);
+        const float w14 = exp(-196.0 * denom_inv);
+        const float w15 = exp(-225.0 * denom_inv);
+        const float w16 = exp(-256.0 * denom_inv);
+        const float w17 = exp(-289.0 * denom_inv);
+        const float w18 = exp(-324.0 * denom_inv);
+        const float w19 = exp(-361.0 * denom_inv);
+        const float w20 = exp(-400.0 * denom_inv);
+        const float w21 = exp(-441.0 * denom_inv);
+        //  Note: If the implementation uses a smaller blur than the max allowed,
+        //  the worst case scenario is that the center weight will be overestimated,
+        //  so we'll put a bit more energy into the brightpass...no huge deal.
+        //  Then again, if the implementation uses a larger blur than the max
+        //  "allowed" because of dynamic branching, the center weight could be
+        //  underestimated, which is more of a problem...consider always using [sic: the note ends here]
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            //  43x blur: center tap plus w1..w21 on each side
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 +
+                w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            //  31x blur: center tap plus w1..w15 on each side
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 +
+                w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            //  25x blur: center tap plus w1..w12 on each side
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            //  17x blur: center tap plus w1..w8 on each side
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+        #else
+            //  9x blur: center tap plus w1..w4 on each side
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+        const float center_weight = weight_sum_inv * weight_sum_inv;  //  1D normalizer squared (presumably one factor per separable pass -- confirm)
+        return center_weight;
+    #endif
+}
+
+inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{   //  Forwards all four arguments to one fixed-size tex2DblurNNfast variant.
+    //  If sigma is static, we can safely branch and use the smallest blur
+    //  that's big enough.  Ignore #define hints, because we'll only use a
+    //  large blur if we actually need it, and the branches cost nothing.
+    #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+    #else
+        //  It's still worth branching if the profile supports dynamic branches:
+        //  It's much faster than using a hugely excessive blur, but each branch
+        //  eats ~1% FPS.
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        #endif
+    #endif
+    //  Failed optimization notes:
+    //  I originally created a same-size mipmapped 5-tap separable blur10 that
+    //  could handle any sigma by reaching into lower mip levels.  It was
+    //  as fast as blur25fast for runtime sigmas and a tad faster than
+    //  blur31fast for static sigmas, but mipmapping two viewport-size passes
+    //  ate 10% of FPS across all codepaths, so it wasn't worth it.
+    #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        if(sigma <= blur9_std_dev)  //  first blurNN_std_dev that is >= sigma picks the blur size
+        {
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur17_std_dev)
+        {
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur25_std_dev)
+        {
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur31_std_dev)
+        {
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        }
+        else
+        {
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        }
+    #else
+        //  If we can't afford to branch, we can only guess at what blur
+        //  size we need.  Therefore, use the largest blur allowed.
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        #else
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    #endif  //  PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+}
+
+inline float get_bloom_approx_sigma(const float output_size_x_runtime,
+    const float estimated_viewport_size_x)
+{
+    //  Requires:   1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
+    //                  This is included for dynamic codepaths just in case the
+    //                  following two globals are incorrect:
+    //              2.) bloom_approx_size_x_for_fake should == the same
+    //                  if PHOSPHOR_BLOOM_FAKE is #defined
+    //              3.) bloom_approx_size_x should == the same otherwise
+    //  Returns:    For gaussian4x4, return a dynamic small bloom sigma that's
+    //              as close to optimal as possible given available information.
+    //              For blur3x3, return a static small bloom sigma that
+    //              works well for typical cases.  Otherwise, we're using simple
+    //              bilinear filtering, so use static calculations.
+    //  Assume the default static value.  This is a compromise that ensures
+    //  typical triads are blurred, even if unusually large ones aren't.
+    static const float mask_num_triads_static =
+        max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
+    const float mask_num_triads_from_size =
+        estimated_viewport_size_x/mask_triad_size_desired;  //  triad count implied by the desired triad size
+    const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x,
+        lerp(mask_num_triads_from_size, mask_num_triads_desired,
+            mask_specify_num_triads));  //  blends toward mask_num_triads_desired as mask_specify_num_triads goes 0 -> 1
+    //  Assume an extremely large viewport size for asymptotic results:
+    static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
+    {
+        //  Use the runtime num triads and output size:
+        const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_runtime;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_runtime/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  account for the Gaussian scanline sigma from the last pass too.
+        //  The bloom will be too wide horizontally but tall enough vertically.
+        return length(float2(bloom_approx_sigma, beam_max_sigma));
+    }
+    else    //  3x3 blur resize (the bilinear resize doesn't need a sigma)
+    {   //  (neither function argument is used on this path)
+        //  We're either using blur3x3 or bilinear filtering.  The biggest
+        //  reason to choose blur3x3 is to avoid dynamic weights, so use a
+        //  static calculation.
+        #ifdef PHOSPHOR_BLOOM_FAKE
+            static const float output_size_x_static =
+                bloom_approx_size_x_for_fake;
+        #else
+            static const float output_size_x_static = bloom_approx_size_x;
+        #endif
+        static const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_static;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_static/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  try accounting for the Gaussian scanline sigma from the last pass
+        //  too; use the static default value:
+        return length(float2(bloom_approx_sigma, beam_max_sigma_static));
+    }
+}
+
+inline float get_final_bloom_sigma(const float bloom_sigma_runtime)
+{
+    //  Choose the sigma used for the final phosphor bloom blur.
+    //  bloom_sigma_runtime: a precalculated sigma that's optimal for the
+    //      [known] triad size; it is only used when
+    //      RUNTIME_PHOSPHOR_BLOOM_SIGMA is #defined (about 10% slower).
+    //  Otherwise the result is the "optimistic" static sigma for
+    //  mask_triad_size_desired_static, i.e. average-size triads rather than
+    //  worst-case huge ones, because overblurring looks as bad as
+    //  underblurring.
+    //  Call this from the fragment shader, NOT the vertex shader, or a
+    //  static sigma won't be constant-folded.
+    #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        //  Static path: sigma for the compile-time triad size and the
+        //  shared bloom_diff_thresh tolerance.
+        return get_min_sigma_to_blur_triad(
+            mask_triad_size_desired_static, bloom_diff_thresh);
+    #else
+        //  Runtime path: trust the caller's precalculated sigma.
+        return bloom_sigma_runtime;
+    #endif
+}
+
+
+#endif  //  BLOOM_FUNCTIONS_H
+
+////////////////////////////  END BLOOM-FUNCTIONS  ///////////////////////////
+
+///////////////////////////  END FRAGMENT-INCLUDES  //////////////////////////
+
+void main() {
+    //  Horizontal half of the bloom: blur the (already vertically blurred)
+    //  brightpass by 9/17/25/31/43x, whichever tex2DblurNfast selects:
+    const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime);
+    const float3 blurred_brightpass = tex2DblurNfast(bloom_texture,
+        bloom_tex_uv, bloom_dxdy, bloom_sigma);
+
+    //  Fetch the unblurred brightpass and the masked scanlines; their
+    //  difference is the mask dimpass:
+    const float3 brightpass = tex2D_linearize(BRIGHTPASStexture,
+        brightpass_tex_uv).rgb;
+    const float3 intensity_dim =
+        tex2D_linearize(MASKED_SCANLINEStexture, scanline_tex_uv).rgb;
+    const float3 dimpass = intensity_dim - brightpass;
+
+    //  Recombine with the blurred brightpass, then undo the scanline
+    //  auto-dim and the mask dimming before applying the contrast:
+    const float mask_amplify = get_mask_amplify();
+    const float undim_factor = 1.0/levels_autodim_temp;
+    const float3 phosphor_bloom = (dimpass + blurred_brightpass) *
+        mask_amplify * undim_factor * levels_contrast;
+
+    //  Let some light bleed into refractive diffusion via the halation
+    //  blur.  Conceptually this precedes the phosphor bloom, but adding it
+    //  in earlier passes causes black crush in the diffusion colors:
+    const float3 diffusion_color = levels_contrast * tex2D_linearize(
+        HALATION_BLURtexture, halation_tex_uv).rgb;
+
+    //  Mix by diffusion_weight, then encode and output the bloomed image:
+    FragColor = encode_output(float4(
+        lerp(phosphor_bloom, diffusion_color, diffusion_weight), 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.vs b/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.vs
new file mode 100644
index 00000000..5d9ad005
--- /dev/null
+++ b/shaders/CRT-Royale.shader/bloom-horizontal-reconstitute.vs
@@ -0,0 +1,6570 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+in vec4 position;
+in vec2 texCoord;
+
+// These outputs didn't want to work in the vertex shader, so they are commented out:
+out Vertex {
+   vec2 vTexCoord;
+//   vec2 video_uv;
+//   vec2 scanline_tex_uv;
+//   vec2 halation_tex_uv;
+//   vec2 brightpass_tex_uv;
+//   vec2 bloom_tex_uv;
+   vec2 bloom_dxdy;
+   float bloom_sigma_runtime;
+};
+
+uniform vec4 targetSize;      // .xy is aliased to output_size below
+uniform vec4 sourceSize[];    // [0].xy is aliased to texture_size and video_size below
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000  // see crt_gamma_static below: range [1, 5]
+#define lcd_gamma 2.200000  // see lcd_gamma_static below: range [1, 5]
+#define levels_contrast 1.0  // see levels_contrast_static below: range [0, 4)
+#define halation_weight 0.0  // see halation_weight_static below: range [0, 1]
+#define diffusion_weight 0.075  // see diffusion_weight_static below: range [0, 1]
+#define bloom_underestimate_levels 0.8  // see bloom_underestimate_levels_static below: range [0, 5]
+#define bloom_excess 0.000000  // see bloom_excess_static below: range [0, 1]
+#define beam_min_sigma 0.020000  // see beam_min_sigma_static below: range (0, 1]
+#define beam_max_sigma 0.300000  // see beam_max_sigma_static below: range (0, 1]
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms (function-like macros substitute their arguments textually)
+#define mul(a,b) ((b)*(a))  // operands parenthesized so compound arguments keep their grouping; the order is deliberately reversed
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)  // NOTE(review): GLSL mod() floors while Cg fmod() truncates, so results differ for negative x
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  // NOTE(review): this swaps the operands, yet Cg atan2(y,x) and GLSL atan(y,x) share one order -- confirm against callers
+#define rsqrt(c) inversesqrt(c)
+
+#define MASKED_SCANLINEStexture source[2]  // each NAMEtexture alias is paired with size aliases reading sourceSize[] at the same index
+#define MASKED_SCANLINEStexture_size sourceSize[2].xy
+#define MASKED_SCANLINESvideo_size sourceSize[2].xy
+#define HALATION_BLURtexture source[5]
+#define HALATION_BLURtexture_size sourceSize[5].xy
+#define HALATION_BLURvideo_size sourceSize[5].xy
+#define BRIGHTPASStexture source[1]
+#define BRIGHTPASStexture_size sourceSize[1].xy
+#define BRIGHTPASSvideo_size sourceSize[1].xy
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif  // COMPAT_PRECISION: mediump when GL_ES is defined, empty otherwise
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif  // COMPAT_TEXTURE: texture() from GLSL 1.30 on, texture2D() before that
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; currently the 0.5 default (the former "set it to 1.0" remark did not match this value)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;          //  0 = power function, 1 = spherical function
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;      //  range [0, 2]
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an older note here mentions 1.0 for a brighter image, but this line sets 0.5)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;          //  0 = power function, 1 = spherical function
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;      //  range [0, 2]
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  !PHOSPHOR_MASK_GRILLE14: 15-pixel grille stripes (the default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  !SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static value as-is
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE    //  (compatibility mode may have removed it)
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD      //  (the TEX2DBIAS backups above are kept)
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  !DRIVERS_ALLOW_DYNAMIC_BRANCHES: only if the user forces it
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        //  AA base radius is 4.0 px, widened by the largest |subpixel offset|:
+        static const float max_subpixel_offset = max(
+            abs(aa_subpixel_r_offset_static.x), abs(aa_subpixel_r_offset_static.y));
+        static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else   //  !GEOMETRY_EARLY: no same-pass antialiasing border is needed
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  A tex2Dlod/tex2Dbias strategy is active: add nothing
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else   //  !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else   //  !GEOMETRY_EARLY: one texel per pixel, so just round up
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  One tile plus a border on each side
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: always one tile plus a border on each side
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif  //  DRIVERS_ALLOW_DERIVATIVES
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default and the value used here (1.0 is brighter if 0.5 looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;        //  2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;        //  1: 3x3 resize blur (used when RADEON_FIX is #defined)
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default and the value used here (1.0 is brighter if 0.5 looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;        //  2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;        //  1: 3x3 resize blur (used when RADEON_FIX is #defined)
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifndef PHOSPHOR_MASK_GRILLE14                  //  Default: 15-pixel stripes
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#else                                           //  Opt-in: 14-pixel stripes
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; sign is lost
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  No compatibility mode: Use the static filter choice unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD       //  tex2Dlod: Drop the rest.
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else                                           //  No tex2Dlod strategy:
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS  //  tex2Dbias is next in line.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else                                       //  Neither tex2D* strategy:
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT      //  Explicitly forced.
+#else
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT  //  Branches allowed.
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+    //  Pad by the subpixel offset's magnitude (the static x default is < 0):
+    static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x);
+    //  Most antialiasing filters have a base radius of 4.0 pixels:
+    static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifdef GEOMETRY_EARLY
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#else
+    #ifndef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #else
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #endif
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Build a unit-length (x, y) aspect vector: Cap the requested ratio at
+    //  geom_max_aspect_ratio, then normalize so the absolute scale can't
+    //  leak into the uv-mapping for curvature.
+    const float capped_ratio = min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const float2 unnormalized_aspect = float2(capped_ratio, 1.0);
+    const float2 unit_aspect = normalize(unnormalized_aspect);
+    return unit_aspect;
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Packs the scalar geom_overscan_x/_y values into one (x, y) vector.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Packs the scalar geom_tilt_angle_x/_y values into one (x, y) vector.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  Packs the per-channel x convergence offsets in (r, g, b) order.
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  Packs the per-channel y convergence offsets in (r, g, b) order.
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Packs the red channel's convergence offsets as an (x, y) vector.
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Packs the green channel's convergence offsets as an (x, y) vector.
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Packs the blue channel's convergence offsets as an (x, y) vector.
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        return aa_subpixel_r_offset_static;
+    #else
+        #ifndef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            return aa_subpixel_r_offset_static;
+        #else
+            //  Both runtime switches are on.  WARNING: EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #endif
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    if(mask_type < 0.5) { return grille_gain; }         //  aperture grille
+    else if(mask_type < 1.5) { return slot_gain; }      //  slot mask
+    else { return shadow_gain; }                        //  shadow mask
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Read the requested mode from mask_sample_mode_desired or the static:
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        const float requested_mode = mask_sample_mode_desired;
+    #else
+        const float requested_mode = mask_sample_mode_static;
+    #endif
+    //  Without PHOSPHOR_MASK_MANUALLY_RESIZE, mode 0 (manual resize) is
+    //  unavailable, so restrict the result to [1, 2]:
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+///////////////////////////////  VERTEX INCLUDES  //////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas (ntsc_gamma is also the default inter-pass gamma):
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default for get_crt_gamma()
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default for get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  true: the encode/decode functions below return alpha = 1.0
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    //  Built-in defaults (Game Boy Advance gamma is a fixed 3.5; in (3.0, 4.0)):
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; }
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#else   //  The user promises to globally define crt_gamma, gba_gamma, lcd_gamma:
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  User-supplied globals: intermediate_gamma, input_gamma, output_gamma.
+    inline float get_intermediate_gamma() { return intermediate_gamma; }
+    inline float get_input_gamma() { return input_gamma; }
+    inline float get_output_gamma() { return output_gamma; }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma() { return ntsc_gamma; }
+
+    //  Pick the first/last-pass gammas from the SIMULATE_* macros.  Exactly one
+    //  branch is compiled; earlier tests win when several macros are defined:
+    //  CRT_ON_LCD, then GBA_ON_LCD, then LCD_ON_CRT, then GBA_ON_CRT.
+    //  Each branch pairs one device getter for input with one for output, and
+    //  the fallback uses ntsc_gamma for both.
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma() { return get_crt_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma() { return get_lcd_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma() { return ntsc_gamma; }
+        inline float get_output_gamma() { return ntsc_gamma; }
+    #endif  //  SIMULATE_* selection
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+//  Input side: the first pass always decodes with the input gamma.  Later passes
+//  decode with the intermediate gamma only under GAMMA_ENCODE_EVERY_FBO.
+#ifdef FIRST_PASS
+    static const bool linearize_input = true;
+    inline float get_pass_input_gamma() { return get_input_gamma(); }
+#else
+    #ifdef GAMMA_ENCODE_EVERY_FBO
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma() { return 1.0; }
+    #endif  //  GAMMA_ENCODE_EVERY_FBO
+#endif  //  FIRST_PASS
+
+//  Output side: the last pass always encodes with the output gamma.  Earlier
+//  passes encode with the intermediate gamma only under GAMMA_ENCODE_EVERY_FBO.
+#ifdef LAST_PASS
+    static const bool gamma_encode_output = true;
+    inline float get_pass_output_gamma() { return get_output_gamma(); }
+#else
+    #ifdef GAMMA_ENCODE_EVERY_FBO
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma() { return 1.0; }
+    #endif  //  GAMMA_ENCODE_EVERY_FBO
+#endif  //  LAST_PASS
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;  //  true exactly when decode_input() returns its argument unchanged
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Applies this pass's output gamma encoding to a color.
+    //  Params:     color: RGBA value to encode.
+    //  Returns:    color untouched when gamma_encode_output is false.
+    //              Otherwise rgb is raised to 1.0/get_pass_output_gamma(), and
+    //              alpha is color.a, or 1.0 when assume_opaque_alpha is true.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+
+    //  Broadcast the scalar exponent so pow() operates on all three channels:
+    float3 exponent = float3(1.0/get_pass_output_gamma());
+    float3 encoded_rgb = pow(color.rgb, exponent);
+    float alpha_out = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, alpha_out);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Undoes this pass's input gamma encoding on a color.
+    //  Params:     color: RGBA value to decode.
+    //  Returns:    color untouched when linearize_input is false.
+    //              Otherwise rgb is raised to get_pass_input_gamma(), and
+    //              alpha is color.a, or 1.0 when assume_opaque_alpha is true.
+    if(!linearize_input)
+    {
+        return color;
+    }
+
+    //  Broadcast the scalar exponent so pow() operates on all three channels:
+    float3 exponent = float3(get_pass_input_gamma());
+    float3 decoded_rgb = pow(color.rgb, exponent);
+    float alpha_out = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(decoded_rgb, alpha_out);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Decodes color with a caller-specified per-channel gamma.  Unlike
+    //  decode_input(), this never consults linearize_input.
+    //  Params:     color: RGBA value to decode; gamma: exponent for r, g, b.
+    //  Returns:    pow(color.rgb, gamma), with alpha set to color.a, or to 1.0
+    //              when assume_opaque_alpha is true.
+    float3 decoded_rgb = pow(color.rgb, gamma);
+    float alpha_out = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(decoded_rgb, alpha_out);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: Sample tex and run the result through decode_input().
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 coords: only tex_coords.xy reaches the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is handed to textureLod() as its LOD argument, not applied as a texel offset -- confirm intent.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  float3 coords with texel_off: only tex_coords.xy is used, and texel_off is again passed as the LOD.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: Only tex_coords.xy is used; this overload always samples LOD 0.0 (tex_coords.zw are ignored).
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): this overload passes texel_off to textureLod() as the LOD, not as a texel offset -- confirm intent.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: Sample tex_coords.xy at LOD 0.0 (tex_coords.zw are ignored) and decode with the given per-channel gamma.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): this overload passes texel_off to textureLod() as the LOD, not as a texel offset -- confirm intent.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what's allowed:
+//  Dynamic loops not allowed: Use a flat static loop (USE_SINGLE_STATIC_LOOP).
+//  Dynamic loops accommodated: Coarsely branch around static loops (BREAK_LOOPS_INTO_PIECES).
+//  Dynamic loops assumed allowed: Use a flat dynamic loop (neither macro is defined).
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  See if we can get a static min tile size > mask_min_allowed_tile_size:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);   //  Rounded up to a whole number
+static const float mask_min_expected_tile_size = 
+        mask_min_allowed_tile_size;    //  As written, simply equal to the allowed minimum
+//  Limit the number of sinc resize taps by the maximum minification factor:
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;    //  2*lobes * (source LUT width / smallest tile)
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;   //  Next multiple of 4 at or above the float count
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   These global constants must already be defined:
+    //              1.) mask_sinc_lobes
+    //              2.) max_sinc_resize_samples_m4
+    //  Returns:    The sample count (a multiple of 4) needed to downsize at
+    //              magnification_scale, capped at the allowed maximum.
+    //  Find the cap first.  Simulating loops with branches imposes a
+    //  128-sample limit on top of the static maximum:
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float sample_cap_m4 = max_sinc_resize_samples_m4;
+    #else   // ifdef BREAK_LOOPS_INTO_PIECES
+        const float sample_cap_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    //  We're downsizing, so the filter spans 2*lobes output pixels (not input
+    //  texels); the input samples needed grow as magnification_scale shrinks:
+    const float needed_samples = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float needed_samples_m4 = ceil(needed_samples * 0.25) * 4.0;
+    return min(needed_samples_m4, sample_cap_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
+    const float2 tex_size, const float dr,
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Requires:   1.) dr is the uv step of one texel along the axis being
+    //                  resampled: du == 1.0/texture_size.x, or
+    //                  dv == 1.0/texture_size.y.  Passing a scalar saves
+    //                  register space.
+    //              2.) input_tiles_per_texture_r is how many input tiles fit
+    //                  in the input texture along the resampled axis.
+    //              3.) vertical is true for a vertical resample pass and
+    //                  false for a horizontal one.
+    //  Returns:    A float2 for the resampled axis only:
+    //                  x: the first sample's tile_uv coord
+    //                  y: its texel distance from the destination pixel
+    //              (The coord is frac-wrapped; see the note on negatives.)
+    //  Sampling starts at the topmost/leftmost texel and proceeds down/right.
+    //  Both axes are computed as though this were a one-pass 2D resize; the
+    //  axis we aren't resampling holds meaningless values and is discarded
+    //  when the result is packed at the end.
+    const float2 texel_pos = tex_uv * tex_size;
+    const float2 texel_before =
+        floor(texel_pos - float2(under_half)) + float2(0.5);
+    const float2 start_texel = texel_before - float2(samples/2.0 - 1.0);
+    const float2 start_dist_2D = texel_pos - start_texel;
+    //  Go from texels to tex_uv (wrapped), then to tile_uv (wrapped), so
+    //  that frac() can stand in for fmod():
+    const float2 start_uv_wrap_2D = start_texel * dr;
+    const float2 start_tile_uv_wrap_2D =
+        start_uv_wrap_2D * input_tiles_per_texture_r;
+    //  Wrap with frac().  Every sample gets wrapped later, but the first one
+    //  may start out negative, so add 1.0 to each negative coord here:
+    const float2 was_negative = float2(
+        (start_tile_uv_wrap_2D.x < 0.), (start_tile_uv_wrap_2D.y < 0.));
+    const float2 start_tile_uv_2D =
+        frac(start_tile_uv_wrap_2D) + was_negative;
+    //  Keep only the resampled axis, packed as (tile_uv coord, distance):
+    if(vertical)
+    {
+        return float2(start_tile_uv_2D.y, start_dist_2D.y);
+    }
+    return float2(start_tile_uv_2D.x, start_dist_2D.x);
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    //  One [slow] workaround is to force mip level 0 with an explicit LOD:
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);
+    #else   //  NOTE(review): tex2Dbias below is Cg; confirm a GLSL shim exists.
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(dist) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weight_sum, and accumulate the weighted
+//      texels in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+//  NOTE: These macros use the invoking function's locals by name.
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord in [0, 1].  tex_uv_r will contain the tex_uv u or v coord
+    //  for four new texel samples (tile_uv_r scaled by tile_size_uv_r).
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+    //  Per-sample weights: sinc, or Lanczos-windowed sinc; capped at 1.0.
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+    //  Requires new_sample0..3 and true_i from earlier in the same scope:
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the tex_uv size of an input tile (the
+    //                  fraction of the input texture it spans) in the
+    //                  direction we're resampling this pass.
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile_uv step between adjacent samples:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above (they use this function's locals by name).
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Pieces of 64+32+16+8+4 cover 124; one more 4 gives 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Reduce the float4 weight sum to a scalar, normalize by it, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile_uv step between adjacent samples:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above (they use this function's locals by name).
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Pieces of 64+32+16+8+4 cover 124; one more 4 gives 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Reduce the float4 weight sum to a scalar, normalize by it, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   These global constants must be defined and satisfy the
+    //              stated constraints:
+    //              1.) mask_resize_num_triads: High enough that the later
+    //                  mask sampling method shows no artifacts
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The parameters must be supplied as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and this
+    //                  estimate is off, the triad count will miss the user's
+    //                  preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Pass false
+    //                  unless every call in every pass is guaranteed to use
+    //                  the same sizes for the other parameters.
+    //              Across multiple passes, always use the same y viewport
+    //              size/scale, and the same x viewport size/scale whenever
+    //              the x result is used.
+    //  Returns:    The final size of a manually resized mask tile, after the
+    //              desired size is constrained to avoid artifacts.  In
+    //              unusual cases tiles may end up stretched vertically
+    //              (see the long explanation below).
+    //  The LUT's stated dimensions must be correct for these to hold:
+    static const float height_per_width =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float width_per_height = 1.0/height_per_width;
+    static const float2 unit_tile = float2(1.0, height_per_width);
+    //  Desired x size: either the user's triad size, or the viewport width
+    //  split across the user's triad count (misread if the width is wrong):
+    const float wanted_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  Constraints only matter when MASK_RESIZE itself is sampled.
+        return wanted_size_x * unit_tile;
+    }
+    //  Never upsize: Cap the x size at the LUT's own x size, then scale by
+    //  the tile aspect to get a 2D size:
+    const float2 capped_size =
+        min(wanted_size_x, mask_resize_src_lut_size.x) * unit_tile;
+    //  Hold both dimensions within [smallest_size, largest_size]:
+    static const float2 smallest_size =
+        mask_min_allowed_tile_size * unit_tile;
+    const float2 largest_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 bounded_size =
+        clamp(capped_size, smallest_size, largest_size);
+    //  Next, try to keep the tile's aspect ratio.  This is the tricky part:
+    //  While resizing in the y dimension, the x components could be
+    //  MEANINGLESS.  (If estimated_mask_resize_output_size.x is bogus, so are
+    //  largest_size.x and bounded_size.x.)  The y size therefore can't be
+    //  adjusted from bounded_size.x: If it clamps when it shouldn't, it won't
+    //  clamp again when later passes call this function with the correct
+    //  sizes, and the discrepancy will break the sampling coords in
+    //  MASKED_SCANLINES.  So the x size is limited by the y size, but not
+    //  vice versa, unless the caller swears the parameters were the same
+    //  (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, the stretching can be fixed by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    //  Always limit x by y; limit y by x only if the caller has sworn:
+    const float x_limit_from_y = bounded_size.y * width_per_height;
+    const float y_limit_from_x = lerp(bounded_size.y,
+        bounded_size.x * height_per_width,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 aspect_limited_size = float2(
+        min(bounded_size.x, x_limit_from_y),
+        min(bounded_size.y, y_limit_from_x));
+    //  Tiled sampling needs integer tile sizes in both directions to work
+    //  correctly.  floor makes sure we never round up; the FIX_ZERO term is
+    //  there to avoid a rounding bug where floor decreases whole numbers:
+    const float2 whole_texel_size =
+        floor(aspect_limited_size + float2(FIX_ZERO(0.0)));
+    return whole_texel_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Requires:   1.) The requirements of get_resized_mask_tile_size() must
+    //                  hold, especially those on global constants.
+    //              The parameters must be supplied as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    //  Returns:    A float4 holding:
+    //                  xy: tex_uv coords where the mask tile starts
+    //                  zw: tex_uv size of the mask tile, start to end
+    //              The out parameter mask_tiles_per_screen receives the
+    //              number of mask tiles that fit on the screen.
+    //  Start from the final resized tile size.  The viewport size and mask
+    //  resize viewport scale must be correct, but we pass false rather than
+    //  swearing they matched in both mask resize passes: Guaranteed-equal
+    //  inputs would allow a better tile aspect ratio, but a false oath
+    //  yields inconsistent sizes across passes and broken texture
+    //  coordinates.
+    const float sample_mode = get_mask_sample_mode();
+    const float2 resized_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(sample_mode < 0.5)
+    {
+        //  Sampling MASK_RESIZE: The resized tile covers only part of the
+        //  texture and starts at a nonzero offset (room for border texels).
+        const float2 tile_uv_size =
+            resized_tile_size / mask_resize_texture_size;
+        const float2 tiles_skipped = mask_start_texels/resized_tile_size;
+        const float2 tile_start_uv = tiles_skipped * tile_uv_size;
+        //  mask_tiles_per_screen must use the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / resized_tile_size;
+        return float4(tile_start_uv, tile_uv_size);
+    }
+    //  Otherwise the mask texture is sampled directly.  When tiling at the
+    //  original size (1:1 pixel:texel), a "tile" is redefined as the full
+    //  texture containing many triads.  When hardware-resampling an LUT,
+    //  the texture truly holds a single unresized phosphor mask tile
+    //  anyway.  In both cases the tile starts at uv (0, 0) and spans the
+    //  whole texture, so only mask_tiles_per_screen differs between the
+    //  two modes:
+    static const float2 whole_texture_uv_size = float2(1.0);
+    static const float2 whole_texture_start_uv = float2(0.0);
+    if(sample_mode > 1.5)
+    {
+        //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+        mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+    }
+    else
+    {
+        //  Hardware-resize the original LUT:
+        mask_tiles_per_screen = true_viewport_size / resized_tile_size;
+    }
+    return float4(whole_texture_start_uv, whole_texture_uv_size);
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Requires:   1.) tile_uv_wrap holds tile-relative uv coords: The tile
+    //                  spans [0, 1], so (0.5, 0.5) is the tile center.
+    //                  Inputs may range over [0, inf], with the fractional
+    //                  part selecting a position in a repeated tile.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy holds the tex_uv coords
+    //                  where the embedded tile starts in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw holds the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    The tex_uv coords (for texture sampling) that correspond
+    //              to tile_uv_wrap.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        //  The resized mask tile is repeated manually to fill the screen.
+        //  Start with fractional tile_uv coords.  frac/fmod on coords
+        //  confuses anisotropic filtering, so apply whichever fixes the
+        //  user options select (derived-settings-and-constants.h disables
+        //  incompatible options).
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            float2 wrapped_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+        #else
+            float2 wrapped_uv = frac(tile_uv_wrap);
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            //  NOTE(review): The only definition of this helper visible
+            //  nearby is commented out; confirm it exists (or that this
+            //  option stays disabled) before enabling the branch.
+            const float2 wrapped_uv_dx = ddx(wrapped_uv);
+            const float2 wrapped_uv_dy = ddy(wrapped_uv);
+            wrapped_uv = fix_tiling_discontinuities_normalized(wrapped_uv,
+                wrapped_uv_dx, wrapped_uv_dy);
+        #endif
+        //  The tile sits inside a padded FBO and may begin at a nonzero
+        //  offset when border texels are used to avoid artifacts:
+        return mask_tile_start_uv_and_size.xy +
+            wrapped_uv * mask_tile_start_uv_and_size.zw;
+    }
+    //  Otherwise, sample the input phosphor mask texture with hardware
+    //  tiling.  When tiling at the original size (mode 2), the "tile" is
+    //  the whole texture, holding many triads mapped at a 1:1 pixel:texel
+    //  ratio.  OTHERWISE, the texture holds a single unresized tile.
+    //  tile_uv_wrap already has correct coords for both cases!
+    return tile_uv_wrap;
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an earlier note found 0.5 unnecessarily dark and suggested 1.0, but 0.5 is what is set here)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//".
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (raise toward 1.0 if the output looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5] (NOTE(review): presumably the magnitude, since x is -1/3 below - confirm)
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. re-add horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);//  e.g. * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  PHOSPHOR_MASK_GRILLE14 not #defined: use the 15-pixel grille average
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero (yields max(|c|, 2^-16), so the sign of c is dropped); using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (any value > 1.5) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  Full compatibility: use the user's static filter choice unchanged
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility (requesting a tex2Dlod strategy also requests its tex2Dbias twin):
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  tex2Dlod tiling not #defined: tex2Dbias is the next choice
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  tex2Dbias tiling not #defined either: tile-flat-twice is next
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way (tex2Dlod wins over tex2Dbias):
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  tex2Dlod resampling not requested: use the small LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  DRIVERS_ALLOW_TEX2DLOD not #defined: use the small LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  No dynamic branches: only enable runtime selection when forced
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        //  The static r offset may be negative (x is -1.0/3.0 by default), and blue mirrors it, so pad by its magnitude:
+        static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x);
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else   //  GEOMETRY_EARLY not #defined: no antialiasing border is needed
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border (only when no tex2Dlod/tex2Dbias tiling strategy is active):
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY: no extra border
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else   //  Strategy not active: no extra border
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else  //  General case: one tile plus a border on each side
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else  //  GEOMETRY_EARLY: always one tile plus a border on each side
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;   //  Tiles * triads per tile
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;    //  Pi truncated to 12 decimals
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;    //  Slightly below 0.5 (see above)
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral_0^x(e**(-t**2) dt)
+    //              This approximation has a max absolute error of 2.5*10**-5
+    //              with solid numerical robustness and efficiency.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    static const float4 one = float4(1.0);
+    const float4 sign_x = sign(x);  //  erf is odd: fit |x|, then restore sign
+    const float4 t = one/(one + 0.47047*abs(x));
+    const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+        exp(-(x*x));
+    return result * sign_x;
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload of the Abramowitz/Stegun erf() fit (see float4 erf6):
+    static const float3 unit = float3(1.0);
+    const float3 t = unit/(unit + 0.47047*abs(x));
+    const float3 tail = t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+        exp(-(x*x));
+    //  Evaluated on |x| above; restore the sign of x here (erf is odd):
+    return (unit - tail) * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload of the Abramowitz/Stegun erf() fit (see float4 erf6):
+    static const float2 unit = float2(1.0);
+    const float2 t = unit/(unit + 0.47047*abs(x));
+    const float2 tail = t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+        exp(-(x*x));
+    //  Evaluated on |x| above; restore the sign of x here (erf is odd):
+    return (unit - tail) * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload of the Abramowitz/Stegun erf() fit (see float4 erf6):
+    const float t = 1.0/(1.0 + 0.47047*abs(x));
+    const float tail = t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+        exp(-(x*x));
+    //  Evaluated on |x| above; restore the sign of x here (erf is odd):
+    return (1.0 - tail) * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    A tanh()-based approximation of erf(x).  The error is
+    //              visually noticeable, but it's blazing fast and perceptually
+    //              close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Some drivers' tanh() returns garbage (e.g. nVidia 8800GTS).
+    const float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    const float3 scaled_x = 1.202760580 * x;    //  Float3 tanh() fit of erf()
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    const float2 scaled_x = 1.202760580 * x;    //  Float2 tanh() fit of erf()
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    const float scaled_x = 1.202760580 * x;     //  Scalar tanh() fit of erf()
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is #defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 version: erf6() unless ERF_FAST_APPROXIMATION selects erft().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 version: erf6() unless ERF_FAST_APPROXIMATION selects erft().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar version: erf6() unless ERF_FAST_APPROXIMATION selects erft().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller must precompute this value,
+    //                  which gives it the opportunity to reuse the reciprocal.
+    //  Returns:    Approximate gamma function (real-numbered factorial) output
+    //              from a two-coefficient Lanczos approximation.  Coefficients
+    //              were calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  We could use three coeffs (0.0000346 error)
+    //              without hurting latency, but this allows more parallelism
+    //              with outside instructions.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler_e = float4(2.71828182845904523536028747135266249775724709);
+    const float4 s_plus_half = s + float4(0.5);
+    const float4 series = coeff0 + coeff1/(s + float4(1.0));
+    const float4 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    //  pow() * series gives gamma(s + 1); multiply by 1/s to get gamma(s).
+    //  This has less error for small s's than (s -= 1.0) at the beginning.
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 overload; see the float4 gamma_impl() for the full contract.
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler_e = float3(2.71828182845904523536028747135266249775724709);
+    const float3 s_plus_half = s + float3(0.5);
+    const float3 series = coeff0 + coeff1/(s + float3(1.0));
+    const float3 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 overload; see the float4 gamma_impl() for the full contract.
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler_e = float2(2.71828182845904523536028747135266249775724709);
+    const float2 s_plus_half = s + float2(0.5);
+    const float2 series = coeff0 + coeff1/(s + float2(1.0));
+    const float2 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload; see the float4 gamma_impl() for the full contract.
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler_e = 2.71828182845904523536028747135266249775724709;
+    const float s_plus_half = s + 0.5;
+    const float series = coeff0 + coeff1/(s + 1.0);
+    const float pow_base = (s_plus_half + lanczos_g)/euler_e;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard gamma function parameter in [0, 36].
+    //  Returns:    Approximate gamma function output with a maximum
+    //              relative error of 0.000463.  See gamma_impl for details.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    const float3 s_inv = float3(1.0)/s;     //  Float3 version
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    const float2 s_inv = float2(1.0)/s;     //  Float2 version
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    const float s_inv = 1.0/s;              //  Scalar version
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The actual "rolled up" summation looks like:
+    //      last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
+    //      sum = last_sign * last_pow / ((s + 0) * last_factorial)
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          last_sign *= -1.0; last_pow *= z; last_factorial *= k;
+    //          sum += last_sign * last_pow / ((s + k) * last_factorial);
+    //      }
+    //  Unrolled, constant-unfolded and arranged for madds and parallelism:
+    const float4 z_pow_s = pow(z, s);
+    float4 series = s_inv;  //  Term k = 0: 1/s
+    //  Terms k = 1, 2, and 3; each divisor is k! * (s + k):
+    const float4 z2 = z*z;
+    const float4 div1 = s + float4(1.0);
+    const float4 div2 = 2.0*s + float4(4.0);
+    const float4 div3 = 6.0*s + float4(18.0);
+    //float4 div4 = 24.0*s + float4(96.0);
+    series -= z/div1;
+    series += z2/div2;
+    series -= z * z2/div3;
+    //series += z2 * z2 / div4;
+    //  Apply the common z**s factor and return:
+    return z_pow_s * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 overload: 4-term series (k = 0..3), scaled by z**s at the end.
+    const float3 div1 = s + float3(1.0);
+    const float3 div2 = 2.0*s + float3(4.0);
+    const float3 div3 = 6.0*s + float3(18.0);
+    const float3 z2 = z*z;
+    float3 series = s_inv;
+    series -= z/div1;
+    series += z2/div2;
+    series -= z * z2/div3;
+    const float3 z_pow_s = pow(z, s);
+    return z_pow_s * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 overload: 4-term series (k = 0..3), scaled by z**s at the end.
+    const float2 div1 = s + float2(1.0);
+    const float2 div2 = 2.0*s + float2(4.0);
+    const float2 div3 = 6.0*s + float2(18.0);
+    const float2 z2 = z*z;
+    float2 series = s_inv;
+    series -= z/div1;
+    series += z2/div2;
+    series -= z * z2/div3;
+    const float2 z_pow_s = pow(z, s);
+    return z_pow_s * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload: 4-term series (k = 0..3), scaled by z**s at the end.
+    const float div1 = s + 1.0;
+    const float div2 = 2.0*s + 4.0;
+    const float div3 = 6.0*s + 18.0;
+    const float z2 = z*z;
+    float series = s_inv;
+    series -= z/div1;
+    series += z2/div2;
+    series -= z * z2/div3;
+    const float z_pow_s = pow(z, s);
+    return z_pow_s * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+    //  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up:"
+    //      frac = float4('inf');
+    //      float4 one = float4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          frac = ((i * 2.0) - one) + z - s + (i * (s - i))/frac;
+    //      }
+    //  Unrolled and constant-unfolded for madds and parallelism:
+    const float4 scale = pow(z, s) * exp(-z);
+    float4 frac = float4(7.0) + z - s;
+    frac = float4(5.0) + z - s + (3.0*s - float4(9.0))/frac;
+    frac = float4(3.0) + z - s + (2.0*s - float4(4.0))/frac;
+    frac = float4(1.0) + z - s + (s - float4(1.0))/frac;
+    return scale / frac;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 overload: 4-level continued fraction, built from the bottom up.
+    float3 frac = float3(7.0) + z - s;
+    frac = float3(5.0) + z - s + (3.0*s - float3(9.0))/frac;
+    frac = float3(3.0) + z - s + (2.0*s - float3(4.0))/frac;
+    frac = float3(1.0) + z - s + (s - float3(1.0))/frac;
+    const float3 scale = pow(z, s) * exp(-z);
+    return scale / frac;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 overload: 4-level continued fraction, built from the bottom up.
+    float2 frac = float2(7.0) + z - s;
+    frac = float2(5.0) + z - s + (3.0*s - float2(9.0))/frac;
+    frac = float2(3.0) + z - s + (2.0*s - float2(4.0))/frac;
+    frac = float2(1.0) + z - s + (s - float2(1.0))/frac;
+    const float2 scale = pow(z, s) * exp(-z);
+    return scale / frac;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: 4-level continued fraction, built from the bottom up.
+    float frac = 7.0 + z - s;
+    frac = 5.0 + z - s + (3.0*s - 9.0)/frac;
+    frac = 3.0 + z - s + (2.0*s - 4.0)/frac;
+    frac = 1.0 + z - s + (s - 1.0)/frac;
+    const float scale = pow(z, s) * exp(-z);
+    return scale / frac;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  Since we only care about s < 0.5, we only need
+    //              to evaluate two branches (not four) based on z.  Each branch
+    //              uses four terms, with a max relative error of ~0.00182.  The
+    //              branch threshold and specifics were adapted for fewer terms
+    //              from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    static const float4 cutoff = float4(0.775075);
+    //  Flag, per component, whether z falls in the large-z regime:
+    bool4 use_large;
+    use_large.x = z.x > cutoff.x;
+    use_large.y = z.y > cutoff.y;
+    use_large.z = z.z > cutoff.z;
+    use_large.w = z.w > cutoff.w;
+    bool4 use_small = not(use_large);
+    //  Evaluate both branches: Real branches test slower even when available.
+    const float4 from_large = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float4 from_small = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_large * float4(use_large) + from_small * float4(use_small);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  Float3 version: both branches are evaluated, then blended by 0/1 masks.
+    static const float3 cutoff = float3(0.775075);
+    bool3 use_large;
+    use_large.x = z.x > cutoff.x;
+    use_large.y = z.y > cutoff.y;
+    use_large.z = z.z > cutoff.z;
+    bool3 use_small = not(use_large);
+    const float3 from_large = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float3 from_small = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_large * float3(use_large) + from_small * float3(use_small);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  Float2 version: both branches are evaluated, then blended by 0/1 masks.
+    static const float2 cutoff = float2(0.775075);
+    bool2 use_large;
+    use_large.x = z.x > cutoff.x;
+    use_large.y = z.y > cutoff.y;
+    bool2 use_small = not(use_large);
+    const float2 from_large = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float2 from_small = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_large * float2(use_large) + from_small * float2(use_small);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar version: both branches are evaluated, then blended by 0/1 masks.
+    static const float cutoff = 0.775075;
+    const bool use_large = z > cutoff;
+    const float from_large = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float from_small = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return from_large * float(use_large) + from_small * float(!use_large);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  See normalized_ligamma_impl() for details.
+    const float4 recip_s = float4(1.0)/s;
+    const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  Float3 version: precompute 1/s and 1/gamma(s), then defer to _impl.
+    const float3 recip_s = float3(1.0)/s;
+    const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  Float2 version: precompute 1/s and 1/gamma(s), then defer to _impl.
+    const float2 recip_s = float2(1.0)/s;
+    const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar version: precompute 1/s and 1/gamma(s), then defer to _impl.
+    const float recip_s = 1.0/s;
+    const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(intermediate_output, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRTs seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  Low end of that range
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  When true, encode_output(), decode_input() and decode_gamma_input() return
+//  alpha = 1.0 instead of the input alpha; only helpful if users know about it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; }    //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#else
+    //  Override: the user must globally define crt_gamma, gba_gamma, lcd_gamma.
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  Override: the user must globally define the three constants used here.
+    inline float get_intermediate_gamma() { return intermediate_gamma; }
+    inline float get_input_gamma() { return input_gamma; }
+    inline float get_output_gamma() { return output_gamma; }
+#else
+    //  When every pass is gamma-corrected, ntsc_gamma is always used between
+    //  passes so middle passes never need to know what is being simulated:
+    inline float get_intermediate_gamma() { return ntsc_gamma; }
+    //  The first SIMULATE_* macro that is defined picks the in/out gammas:
+    #if defined(SIMULATE_CRT_ON_LCD)
+        //  Decode with the CRT gamma, encode with the LCD gamma:
+        inline float get_input_gamma() { return get_crt_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        //  Decode with the GBA gamma, encode with the LCD gamma:
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        //  Decode with the LCD gamma, encode with the CRT gamma:
+        inline float get_input_gamma() { return get_lcd_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        //  Decode with the GBA gamma, encode with the CRT gamma:
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #else
+        //  Don't simulate anything:
+        inline float get_input_gamma() { return ntsc_gamma; }
+        inline float get_output_gamma() { return ntsc_gamma; }
+    #endif  //  SIMULATE_*
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    static const bool linearize_input = true;       //  Decode in every pass
+    static const bool gamma_encode_output = true;   //  Encode in every pass
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #endif  //  LAST_PASS
+#else   //  Default: only FIRST_PASS decodes and only LAST_PASS encodes.
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma() { return 1.0; }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma() { return 1.0; }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Lets users query whether bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;  //  true iff this pass does no manual decode
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encodes color.rgb for this pass's render target by raising it
+    //  to 1.0/get_pass_output_gamma().  When gamma_encode_output is false
+    //  the color is returned exactly as given.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    const float3 encoded_rgb =
+        pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    //  Under assume_opaque_alpha the incoming alpha is replaced by 1.0:
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearizes a sampled color for this pass by raising color.rgb to
+    //  get_pass_input_gamma().  When linearize_input is false the sample
+    //  is returned exactly as given.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    const float3 linear_rgb =
+        pow(color.rgb, float3(get_pass_input_gamma()));
+    //  Under assume_opaque_alpha the sampled alpha is replaced by 1.0:
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Decodes color.rgb with a caller-supplied per-channel gamma.  Unlike
+    //  decode_input(), the pow() here is applied unconditionally; it does
+    //  not consult linearize_input.
+    //  Alpha is forwarded unchanged unless assume_opaque_alpha is set, in
+    //  which case 1.0 is substituted.
+    const float alpha_out = assume_opaque_alpha ? 1.0 : color.a;
+    const float3 rgb_out = pow(color.rgb, gamma);
+    return float4(rgb_out, alpha_out);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample the texture, then run the result through decode_input().
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 overload: only tex_coords.xy is sampled.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is passed in textureLod()'s third argument slot (the LOD in GLSL), not as a texel offset -- confirm intended.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  float3 overload of the texel_off variant: samples tex_coords.xy, with the same texel_off-as-LOD caveat.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: samples tex_coords.xy at a hard-coded LOD of 0.0 (tex_coords.zw are unused), then decode_input().
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): texel_off is passed as textureLod()'s third (LOD) argument here -- confirm intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
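+//  NOTE(review): the block comment opened before "tex3D:" above is NOT closed here; it stays open until after the tex2Dfetch_linearize_gamma wrappers below.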
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: samples tex_coords.xy at a hard-coded LOD of 0.0 (tex_coords.zw are unused), then decode_gamma_input() with the given gamma.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): texel_off is passed as textureLod()'s third (LOD) argument here -- confirm intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  holding the desired minimum and maximum beam standard
+    //                  deviations (for dim and bright colors respectively).
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; it is
+    //                  passed in so callers compute it once even when
+    //                  beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
+    //              inner f(color) subfunction (see below) as:
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The standard deviation of the Gaussian beam for "color:"
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube balances
+    //  between variable width and a soft/realistic shape.   A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    float3 spot_shape;
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Power curve: f(color) = pow(color, beam_spot_power)
+        spot_shape = pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Spherical curve: f(color) = sqrt(1.0 - (color - 1.0)^2)
+        const float3 offset_from_one = color - float3(1.0);
+        spot_shape = sqrt(float3(1.0) - offset_from_one*offset_from_one);
+    }
+    return float3(beam_min_sigma) + sigma_range * spot_shape;
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  holding the desired min/max generalized Gaussian
+    //                  beta parameters (for dim and bright colors respectively).
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; it is
+    //                  passed in so callers compute it once even when
+    //                  beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian "shape" parameter beta for
+    //              the given color.
+    //  Details: beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    const float3 shape_curve = pow(color, float3(beam_shape_power));
+    return beam_min_shape + shape_range * shape_curve;
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    A scanline's light output over a given pixel.
+    //  Details:
+    //  The CRT beam profile follows a roughly Gaussian distribution which is
+    //  wider for bright colors than dark ones.  The integral over the full
+    //  range of a Gaussian function is always 1.0, so we can vary the beam
+    //  with a standard deviation without affecting brightness.  'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  Use a numerical approximation of the "error function" (the Gaussian
+    //  indefinite integral) to find the definite integral of the scanline's
+    //  average brightness over a given pixel area.  Even if curved coords were
+    //  used in this pass, a flat scalar pixel height works almost as well as a
+    //  pixel height computed from a full pixel-space to scanline-space matrix.
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 erf_scale = 1.0/(sigma*sqrt(2.0));
+    const float3 erf_lower = erf((dist - half_pixel)*erf_scale);
+    const float3 erf_upper = erf((dist + half_pixel)*erf_scale);
+    return color * 0.5*(erf_upper - erf_lower)/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    A scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models the standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 inv_alpha = float3(1.0)/alpha;
+    const float3 inv_beta = float3(1.0)/beta;   //  "s" in the formulas above
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    //  gamma_impl takes beta alongside s to avoid repeated divides; likewise
+    //  normalized_ligamma_impl takes beta (i.e. 1/s) and 1/gamma(s).
+    const float3 inv_gamma_s = float3(1.0)/gamma_impl(inv_beta, beta);
+    const float3 upper_edge = dist + half_pixel;
+    const float3 lower_edge = dist - half_pixel;
+    const float3 signed_upper = sign(upper_edge) * normalized_ligamma_impl(
+        inv_beta, pow(abs(upper_edge)*inv_alpha, beta), beta, inv_gamma_s);
+    const float3 signed_lower = sign(lower_edge) * normalized_ligamma_impl(
+        inv_beta, pow(abs(lower_edge)*inv_alpha, beta), beta, inv_gamma_s);
+    return color * 0.5*(signed_upper - signed_lower)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  See scanline_gaussian_integral_contrib() for detailed comments!
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
+    //  Precompute reciprocals once to avoid repeated divides:
+    const float3 inv_sigma = float3(1.0)/sigma;
+    const float3 exp_scale = 0.5 * inv_sigma * inv_sigma;
+    const float3 norm_scale = inv_sigma/sqrt(2.0*pi);
+    //  The on-center sample is needed both with and without antialiasing,
+    //  so evaluate it up front:
+    const float3 center_weight = exp(-(dist*dist)*exp_scale);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Also sample 1/3 pixel away in each direction:
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_pixel;
+        const float3 dist_minus = abs(dist - third_pixel);
+        //  Average the three pure Gaussian samples:
+        const float3 plus_weight = exp(-(dist_plus*dist_plus)*exp_scale);
+        const float3 minus_weight = exp(-(dist_minus*dist_minus)*exp_scale);
+        const float3 scale = color/3.0 * norm_scale;
+        return scale * (center_weight + plus_weight + minus_weight);
+    }
+    //  No antialiasing: use the single on-center sample.
+    return color*center_weight*norm_scale;
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Point-sampled counterpart of the function
+    //  scanline_generalized_gaussian_integral_contrib() (see it for details).
+    //  Per channel: beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  alpha = sqrt(2)*get_gaussian_sigma(); beta from the beta helper below.
+    const float3 beam_alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beam_beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Reciprocals, so the per-sample math multiplies instead of dividing:
+    const float3 inv_alpha = float3(1.0)/beam_alpha;
+    const float3 inv_beta = float3(1.0)/beam_beta;
+    //  Normalization factor, already multiplied by the beam color:
+    const float3 norm_color = color * beam_beta * 0.5 * inv_alpha /
+        gamma_impl(inv_beta, beam_beta);
+    //  The sample at dist itself is needed on both codepaths:
+    const float3 center_weight = exp(-pow(abs(dist*inv_alpha), beam_beta));
+    if(!(beam_antialias_level > 0.5))
+    {
+        //  No antialiasing: return the single center sample.
+        return norm_color * center_weight;
+    }
+    //  Antialiasing: average with samples 1/3 pixel farther and closer.
+    const float3 third_px = float3(pixel_height/3.0);
+    const float3 far_weight =
+        exp(-pow(abs((dist + third_px)*inv_alpha), beam_beta));
+    const float3 near_weight =
+        exp(-pow(abs(abs(dist - third_px)*inv_alpha), beam_beta));
+    return norm_color/3.0 * (center_weight + far_weight + near_weight);
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Selects one of four scanline contribution helpers and returns its
+    //  result unchanged.
+    //  Requires:   1.) Requirements of the selected helper must be met (see
+    //                  scanline_gaussian_integral_contrib() for details).
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    A scanline's light output over a given pixel.  User
+    //              codepath choices pick the helper:
+    //                  beam_generalized_gaussian:  generalized vs. pure
+    //                                              Gaussian distribution
+    //                  beam_antialias_level > 1.5: integral vs. sampling
+    //  Evaluate the antialiasing choice once for both distributions:
+    const bool use_integral = beam_antialias_level > 1.5;
+    if(beam_generalized_gaussian)
+    {
+        //  Generalized Gaussian beam; shape_range is forwarded:
+        if(use_integral)
+        {
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        return scanline_generalized_gaussian_sampled_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+    //  Pure Gaussian beam; shape_range is not forwarded to either helper
+    //  on this path:
+    if(use_integral)
+    {
+        return scanline_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range);
+    }
+    return scanline_gaussian_sampled_contrib(
+        dist, color, pixel_height, sigma_range);
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Weighted sum of the four colors; max() avoids bizarre artifacts from negative colors:
+    const float4x3 colors = float4x3(color0, color1, color2, color3);
+    return max(mul(weights, colors), 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  get_intermediate_gamma() must be available, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture.
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    An interpolated color lookup between the four input colors
+    //              based on the weights in weights.  The final color is a
+    //              linear RGB value, but the blending is done in the mix of
+    //              colorspaces indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  The weight is static or dynamic branches are allowed, so branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are gamma-encoded RGB.  GLSL pow() needs a float3 exponent here.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+            //  FIXME (inherited, unexplained): a constant 1.0 stand-in for the
+            //  weight (beam_horiz_linear_rgb_weight1) was tried here and disabled.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv is snapped vertically to the desired line
+    //                  or scanline, and horizontally to the texel just left
+    //                  of the output pixel (the "inner left" texel below).
+    //              2.) uv_step_x is the horizontal uv distance between texels.
+    //              3.) weights holds the interpolation filter weights for the
+    //                  outer-left, inner-left, inner-right, and outer-right
+    //                  texels, in that order.
+    //  Returns:    A horizontally interpolated texture lookup over 2-4 nearby
+    //              texels, blended by get_interpolated_linear_color().
+    //  The two outer texels are only fetched when beam_horiz_filter > 0.5;
+    //  otherwise (Quilez resampling) they stay black and can be ignored.
+    float3 outer_left = float3(0.0);
+    float3 outer_right = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        outer_left = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        outer_right = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    const float3 inner_left = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 inner_right = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    //  Pass the samples through unmodified, linear or gamma-encoded alike:
+    //  get_interpolated_linear_color() handles the difference.
+    return get_interpolated_linear_color(
+        outer_left, inner_left, inner_right, outer_right, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.
+    //  Samples one horizontally filtered color at tex_uv: snaps to the texel
+    //  left of the sample, builds 2/4 filter weights from the texel distances,
+    //  normalizes them, and hands off to get_scanline_color().
+    const float2 texel_pos = tex_uv * tex_size;
+    //  under_half (rather than 0.5) fixes a rounding bug right around exact
+    //  texel locations.  Only x is snapped; y passes through unchanged.
+    const float left_center_x = floor(texel_pos.x - under_half) + 0.5;
+    const float2 left_texel_uv =
+        float2(left_center_x, texel_pos.y) * texture_size_inv;
+    //  Distances (in texels) from the sample to the four nearby texel centers:
+    const float frac_x = texel_pos.x - left_center_x;
+    const float4 texel_dists =
+        float4(1.0 + frac_x, frac_x, 1.0 - frac_x, 2.0 - frac_x);
+    //  Quilez, Gaussian, or Lanczos2 weights, selected by beam_horiz_filter:
+    float4 raw_weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez: quintic fade between the two inner texels only.
+        const float t = texel_dists.y;
+        const float fade = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
+        raw_weights = float4(0.0, 1.0 - fade, fade, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian with standard deviation beam_horiz_sigma:
+        float exp_scale = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        raw_weights = exp(-(texel_dists*texel_dists)*exp_scale);
+    }
+    else
+    {
+        //  Lanczos2: sinc(x)*sinc(x/2).  FIX_ZERO() presumably avoids 0/0.
+        const float4 pi_x = FIX_ZERO(texel_dists * pi);
+        raw_weights = 2.0 * sin(pi_x) * sin(pi_x * 0.5) / (pi_x * pi_x);
+    }
+    //  Normalize so the weights sum to 1.0, then blend along x only:
+    return get_scanline_color(tex, left_texel_uv,
+        float2(texture_size_inv.x, 0.0),
+        raw_weights/dot(raw_weights, float4(1.0)));
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.
+    //  Returns a horizontally filtered color at tex_uv, optionally sampling
+    //  the red, green, and blue channels at different horizontal positions
+    //  (beam misconvergence).
+    //  Without misconvergence, all three channels share a single lookup:
+    if(!beam_misconvergence)
+    {
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+    //  With misconvergence, each channel is sampled at its own horizontal
+    //  position.  get_convergence_offsets_x_vector() supplies one offset per
+    //  channel; multiplying by texture_size_inv.x converts it to a uv shift,
+    //  which is subtracted from tex_uv.x.
+    const float3 shift_u =
+        get_convergence_offsets_x_vector() * texture_size_inv.xxx;
+    const float3 red_lookup = sample_single_scanline_horizontal(
+        tex, tex_uv - float2(shift_u.r, 0.0), tex_size, texture_size_inv);
+    const float3 green_lookup = sample_single_scanline_horizontal(
+        tex, tex_uv - float2(shift_u.g, 0.0), tex_size, texture_size_inv);
+    const float3 blue_lookup = sample_single_scanline_horizontal(
+        tex, tex_uv - float2(shift_u.b, 0.0), tex_size, texture_size_inv);
+    //  Keep only the matching channel from each shifted lookup:
+    return float3(red_lookup.r, green_lookup.g, blue_lookup.b);
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Find the last/upper scanline belonging to the current field and
+    //  return its uv; dist receives the sample's distance from that scanline
+    //  in units of scanlines.  With interlacing, only even or only odd
+    //  scanlines count on any given frame: top-field-first (TFF) order puts
+    //  even scanlines on even frames, BFF order puts them on odd frames
+    //  (selected by interlace_bff).  Texel centers sit at
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    const float frame_parity = fmod(frame_count + float(interlace_bff), 2.0);
+    const float field_shift = floor(il_step_multiple.y * 0.75) * frame_parity;
+    const float2 texel_pos = tex_uv * tex_size;
+    //  under_half (rather than 0.5) fixes a rounding bug right around exact
+    //  texel locations.
+    const float2 texel_index = floor(texel_pos - float2(under_half));
+    //  Rows to step back to land on a scanline of the current field:
+    const float rows_back =
+        fmod(texel_index.y + field_shift, il_step_multiple.y);
+    //  Snap to the center of that scanline:
+    const float2 line_center =
+        (texel_index - float2(0.0, rows_back)) + float2(0.5);
+    dist = (texel_pos.y - line_center.y)/il_step_multiple.y;
+    return line_center * texture_size_inv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Guess whether the source is interlaced from its number of lines.
+    //  Always false unless interlace_detect is enabled.
+    if(!interlace_detect)
+    {
+        return false;
+    }
+    //  Reference line counts:
+    //      NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+    //      NTSC Emulators: Typically 224 or 240 lines
+    //      PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+    //      PAL Emulators: ?
+    //      ATSC: 720p, 1080i, 1080p
+    //  Cutoff assumptions:
+    //  1.) Only active lines matter.
+    //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+    //  3.) Anything > 576 lines is probably not interlaced...
+    //  4.) ...except 1080 lines, which is ambiguous, so the user decides via
+    //      interlace_1080i.
+    //  5.) The thresholds are nudged by half a line in case the main program
+    //      uses calculated video sizes.
+    //  Standard-definition interlaced range:
+    const bool sd_range = (num_lines > 288.5) && (num_lines < 576.5);
+    const bool hd_range = (num_lines > 1079.5) && (num_lines < 1080.5);
+    //  1080-line sources only count when the user opts in:
+    return sd_range || (bool(interlace_1080i) && hd_range);
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+//////////////////////////////  END VERTEX-INCLUDES  //////////////////////////////
+
+#undef COMPAT_PRECISION  //  Remove the compatibility helper macros;
+#undef COMPAT_TEXTURE  //  COMPAT_TEXTURE is used by get_scanline_color() above.
+
+float bloom_approx_scale_x = targetSize.x / sourceSize[0].y;  //  NOTE(review): x over y; bloom-vertical.fs divides targetSize.y instead - confirm which is intended
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);  //  NOTE(review): not referenced by main() below
+const float bloom_diff_thresh_ = 1.0/256.0;  //  thresh argument passed to get_min_sigma_to_blur_triad() in main()
+
+// copied from bloom-functions.h
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Requires:   1.) triad_size: final phosphor triad size, in pixels.
+    //              2.) thresh: max desired pixel difference within the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    The minimum blur sigma that evens a phosphor triad out to
+    //              a single color, within thresh.  Closed form found by
+    //              curve-fitting: max error ~0.086036, mean sq. error ~0.0013387.
+    //  (A simpler fit, 0.5985*triad_size - triad_size*sqrt(thresh), has
+    //  max error ~0.16486 and mean sq. error ~0.0041041.)
+    const float linear_part = -0.05168 + 0.6113*triad_size;
+    const float thresh_root = sqrt(0.000416 + thresh);
+    return linear_part - 1.122*triad_size*thresh_root;
+}
+
+void main() {  //  Vertex entry point: sets the position, texture coords, and bloom outputs.
+   gl_Position = position;  //  Pass the incoming vertex position through unchanged.
+   vTexCoord = texCoord * 1.0001;  //  NOTE(review): reason for the 1.0001 scale is not stated - confirm
+	float2 tex_uv = vTexCoord.xy;  //  NOTE(review): local; only the disabled block below reads it - confirm it should not write a same-named output
+
+// Disabled: these per-texture coords kept causing weird behavior and are only needed for NPOT textures.
+/*    //  Our various input textures use different coords:
+    const float2 video_uv = tex_uv;// * texture_size/video_size;
+    video_uv = video_uv;
+    scanline_tex_uv = video_uv;// * MASKED_SCANLINESvideo_size /
+        MASKED_SCANLINEStexture_size;
+    halation_tex_uv = video_uv;// * HALATION_BLURvideo_size /
+        HALATION_BLURtexture_size;
+    brightpass_tex_uv = video_uv;// * BRIGHTPASSvideo_size /
+        BRIGHTPASStexture_size;
+    bloom_tex_uv = tex_uv;
+*/
+    //  We're horizontally blurring the bloom input (vertically blurred
+    //  brightpass).  Get the uv distance between output pixels / input texels
+    //  in the horizontal direction (this pass must NOT resize):
+    bloom_dxdy = float2(1.0/texture_size.x, 0.0);  //  One texel step along x, none along y.
+
+    //  Calculate a runtime bloom_sigma in case it's needed:
+    const float mask_tile_size_x = get_resized_mask_tile_size(
+        output_size, output_size * mask_resize_viewport_scale, false).x;
+    bloom_sigma_runtime = get_min_sigma_to_blur_triad(  //  triad size = tile width / triads per tile
+        mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_);
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/bloom-vertical.fs b/shaders/CRT-Royale.shader/bloom-vertical.fs
new file mode 100644
index 00000000..4c37eee1
--- /dev/null
+++ b/shaders/CRT-Royale.shader/bloom-vertical.fs
@@ -0,0 +1,4824 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+uniform sampler2D source[];  //  Input textures; source[0] is aliased below as input_texture.
+uniform vec4 sourceSize[];  //  sourceSize[0].xy is aliased below as texture_size and video_size.
+uniform vec4 targetSize;  //  targetSize.xy is aliased below as output_size.
+
+in Vertex {  //  Inputs received from the vertex stage.
+   vec2 vTexCoord;
+   vec2 tex_uv;
+   vec2 bloom_dxdy;
+   float bloom_sigma_runtime;
+};
+
+out vec4 FragColor;  //  The fragment shader's color output.
+
+// USER SETTINGS BLOCK //
+//  Object-like macros with fixed numeric values; editing one changes it at shader compile time.
+#define crt_gamma 2.500000  //  same value as crt_gamma_static below
+#define lcd_gamma 2.200000  //  same value as lcd_gamma_static below
+#define levels_contrast 1.0  //  same value as levels_contrast_static below
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms
+#define mul(a,b) ((b)*(a))  //  operands swapped; arguments parenthesized so compound expressions group correctly
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  //  the next three macros erase these keywords from the ported code
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)  //  note: GLSL mod() floors where HLSL fmod() truncates (differs for negative operands)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  //  NOTE(review): HLSL atan2 is (y, x) like GLSL atan(y, x); this swaps the arguments - confirm callers expect it
+#define rsqrt(c) inversesqrt(c)
+
+#define MASKED_SCANLINEStexture source[0]  //  Named aliases for the input textures and their sizes:
+#define MASKED_SCANLINEStexture_size sourceSize[0].xy
+#define MASKED_SCANLINESvideo_size sourceSize[0].xy
+#define BLOOM_APPROXtexture source[3]
+#define BLOOM_APPROXtexture_size sourceSize[3].xy
+#define BLOOM_APPROXvideo_size sourceSize[3].xy
+
+#define input_texture source[0]  //  Same texture as MASKED_SCANLINEStexture.
+
+#if defined(GL_ES)  //  Precision qualifier: mediump under GL_ES, empty otherwise.
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130  //  Sampling function name: texture() from GLSL 1.30 on, texture2D() before.
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+float bloom_approx_scale_x = targetSize.y / sourceSize[0].y;  //  NOTE(review): named _x but built from .y sizes - confirm
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);  //  "const" is erased by the macro above, so this is a plain global
+const float bloom_diff_thresh_ = 1.0/256.0;
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+//#include "bloom-functions.h"
+
+////////////////////////////  BEGIN BLOOM-FUNCTIONS  ///////////////////////////
+
+#ifndef BLOOM_FUNCTIONS_H
+#define BLOOM_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These utility functions and constants help several passes determine the
+//  size and center texel weight of the phosphor bloom in a uniform manner.
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  We need to calculate the correct blur sigma using some .cgp constants:
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5 (raise toward 1.0 if the output looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5 (raise toward 1.0 if the output looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;   //  signed; the static default x is -1.0/3.0
+        //  Most antialiasing filters have a base radius of 4.0 pixels.  NOTE(review): a negative offset shrinks this border -- confirm abs() isn't intended:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/blur-functions.h"
+
+////////////////////////////  BEGIN BLUR-FUNCTIONS  ///////////////////////////
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  IN.output_size < IN.video_size.
+//              4.) IN.output_size == IN.video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (IN.video_size/IN.output_size)/IN.texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(IN.video_size/IN.output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static const float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static const float blur3_std_dev
+//                      static const float blur4_std_dev
+//                      static const float blur5_std_dev
+//                      static const float blur6_std_dev
+//                      static const float blur7_std_dev
+//                      static const float blur8_std_dev
+//                      static const float blur9_std_dev
+//                      static const float blur10_std_dev
+//                      static const float blur11_std_dev
+//                      static const float blur12_std_dev
+//                      static const float blur17_std_dev
+//                      static const float blur25_std_dev
+//                      static const float blur31_std_dev
+//                      static const float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static const float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Default blurN_std_dev per N-tap blur.  #define OVERRIDE_BLUR_STD_DEVS to
+//  supply your own instead (static constants, or uniforms at a speed cost):
+#ifndef OVERRIDE_BLUR_STD_DEVS
+    //  blurN_std_dev values are specified in terms of dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  Opt-in alternative: standard deviations corresponding to a
+        //  binomial distribution with p = 0.5 (related to Pascal's triangle).
+        //  This distribution works such that blurring multiple times should
+        //  have the same result as a single larger blur.  Versus the defaults
+        //  below, blur3..blur6 are larger here and blur7 and up are smaller.
+        static const float blur3_std_dev = 0.84931640625;
+        static const float blur4_std_dev = 0.84931640625;
+        static const float blur5_std_dev = 1.0595703125;
+        static const float blur6_std_dev = 1.06591796875;
+        static const float blur7_std_dev = 1.17041015625;
+        static const float blur8_std_dev = 1.1720703125;
+        static const float blur9_std_dev = 1.2259765625;
+        static const float blur10_std_dev = 1.21982421875;
+        static const float blur11_std_dev = 1.25361328125;
+        static const float blur12_std_dev = 1.2423828125;
+        static const float blur17_std_dev = 1.27783203125;
+        static const float blur25_std_dev = 1.2810546875;
+        static const float blur31_std_dev = 1.28125;
+        static const float blur43_std_dev = 1.28125;
+    #else
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        static const float blur3_std_dev = 0.62666015625;
+        static const float blur4_std_dev = 0.66171875;
+        static const float blur5_std_dev = 0.9845703125;
+        static const float blur6_std_dev = 1.02626953125;
+        static const float blur7_std_dev = 1.36103515625;
+        static const float blur8_std_dev = 1.4080078125;
+        static const float blur9_std_dev = 1.7533203125;
+        static const float blur10_std_dev = 1.80478515625;
+        static const float blur11_std_dev = 2.15986328125;
+        static const float blur12_std_dev = 2.215234375;
+        static const float blur17_std_dev = 3.45535583496;
+        static const float blur25_std_dev = 5.3409576416;
+        static const float blur31_std_dev = 6.86488037109;
+        static const float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
+    //  in shared-sample blurs but increase blurring and feature shifting.
+    static const float error_blurring = 0.5;    //  Default: middle of the range
+#endif  //  OVERRIDE_ERROR_BLURRING
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Standard gamma constants.  #define OVERRIDE_STANDARD_GAMMA to skip these and supply all seven yourself:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default get_crt_gamma()
+    static const float crt_reference_gamma_low = 2.35;  //  Low end of (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  When true, encode_output() and the decode_*input() functions return
+//  alpha == 1.0 instead of the input alpha.  Off unless the includer overrides.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  Opaque alpha may help users avoid bugs, but only if they're aware of it
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device decoding gammas, as functions so overrides may be uniforms:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; }  //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#else
+    //  The includer promises to have globally defined these constants:
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The includer promises to have globally defined all three constants:
+    inline float get_intermediate_gamma() { return intermediate_gamma; }
+    inline float get_input_gamma() { return input_gamma; }
+    inline float get_output_gamma() { return output_gamma; }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes so
+    //  middle passes needn't care whether anything is being simulated:
+    inline float get_intermediate_gamma() { return ntsc_gamma; }
+
+    //  Choose the first-pass input (decoding) gamma and last-pass output
+    //  (encoding) gamma from the first SIMULATE_* macro that is #defined.
+    //  The macros are tested in the order listed below, so if several are
+    //  #defined at once, the earliest one in this chain takes precedence.
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma() { return get_crt_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma() { return get_lcd_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #else
+        //  Don't simulate anything: decode and encode with ntsc_gamma.
+        inline float get_input_gamma() { return ntsc_gamma; }
+        inline float get_output_gamma() { return ntsc_gamma; }
+    #endif  //  SIMULATE_* selection
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Decoding/encoding gammas for the current pass.  linearize_input and
+//  gamma_encode_output are plain static constants (they aren't derived), so
+//  the compiler can do dead-code elimination on the branches that test them.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #endif  //  LAST_PASS
+#else   //  Only the first pass decodes and only the last pass encodes:
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma() { return 1.0; }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma() { return 1.0; }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Lets users know if bilinear filtering will be gamma-correct: true exactly when linearize_input is false.
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+//  Gamma-encode a linear color for output from the current pass.  Returns the
+//  color untouched when gamma_encode_output is false; otherwise raises rgb to
+//  1.0/get_pass_output_gamma().
+inline float4 encode_output(const float4 color)
+{
+    //  Nothing to encode for this pass.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+
+    //  pow() is applied per channel, to rgb only.
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    //  Alpha is never gamma-encoded: it is either forced to 1.0 or passed
+    //  through unchanged, depending on assume_opaque_alpha.
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, out_alpha);
+}
+
+//  Linearize a color read from the current pass's input.  Returns the color
+//  untouched when linearize_input is false; otherwise raises rgb to
+//  get_pass_input_gamma().
+inline float4 decode_input(const float4 color)
+{
+    //  Nothing to decode for this pass.
+    if(!linearize_input)
+    {
+        return color;
+    }
+
+    //  pow() is applied per channel, to rgb only.
+    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    //  Alpha is never gamma-decoded: it is either forced to 1.0 or passed
+    //  through unchanged, depending on assume_opaque_alpha.
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, out_alpha);
+}
+
+//  Decode color.rgb with a caller-supplied per-channel gamma.  Unlike
+//  decode_input(), this always decodes (linearize_input is not consulted).
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample tex and decode via decode_input().  Coordinate params are non-const, presumably because of the 'const' issue in the EDIT note above.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 coordinates: only tex_coords.xy reaches the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is passed as textureLod()'s lod argument here, not applied as a texel offset -- confirm this is intended.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  Same as the previous overload, for float3 coordinates (only .xy is used).
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: NOTE(review): lod is hard-coded to 0.0 and tex_coords.zw are unused (Cg's tex2Dlod reads lod from .w) -- confirm intent.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  With texel_off: the value is passed as textureLod()'s lod argument; tex_coords.zw are again unused.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
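+/////////*  NOTE(review): this line has no comment terminator, so the block comment opened above "tex3D:" stays open until the terminator that follows tex2Dfetch_linearize_gamma below -- confirm intent.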
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }  //  NOTE(review): appears to sit inside the block comment opened above the tex3D section (that section's last line has no terminator), so this overload is likely not compiled -- confirm
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)  //  float3 coords: only .xy is sampled
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }  //  NOTE(review): appears to sit inside the block comment opened above the tex3D section (that section's last line has no terminator), so this overload is likely not compiled -- confirm
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)  //  Only tex_coords.xy is read here
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }  //  Samples with an explicit LOD of 0.0, then decodes with the caller-supplied gamma
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)  //  Only tex_coords.xy is read here
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }  //  NOTE(review): texel_off is passed in textureLod's LOD argument slot -- confirm this is intended
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "quad-pixel-communication.h"
+
+///////////////////////  BEGIN QUAD-PIXEL-COMMUNICATION  //////////////////////
+
+#ifndef QUAD_PIXEL_COMMUNICATION_H
+#define QUAD_PIXEL_COMMUNICATION_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey*
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DISCLAIMER  /////////////////////////////////
+
+//  *This code was inspired by "Shader Amortization using Pixel Quad Message
+//  Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2.  My intent
+//  is not to plagiarize his fundamentally similar code and assert my own
+//  copyright, but the algorithmic helper functions require so little code that
+//  implementations can't vary by much except bugfixes and conventions.  I just
+//  wanted to license my own particular code here to avoid ambiguity and make it
+//  clear that as far as I'm concerned, people can do as they please with it.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  Given screen pixel numbers, derive a "quad vector" describing a fragment's
+//  position in its 2x2 pixel quad.  Given that vector, obtain the values of any
+//  variable at neighboring fragments.
+//  Requires:   In general, using this file requires that:
+//              1.) ddx() and ddy() are present in the current Cg profile.
+//              2.) The GPU driver is using fine/high-quality derivatives.
+//                  Functions will give incorrect results if this is not true,
+//                  so a test function is included.
+
+
+/////////////////////  QUAD-PIXEL COMMUNICATION PRIMITIVES  ////////////////////
+
+float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   Two measures of the current fragment's output pixel number
+    //              in the range ([0, IN.output_size.x), [0, IN.output_size.y)):
+    //              1.) output_pixel_num_wrt_uvxy.xy grow along the uv axes.
+    //              2.) output_pixel_num_wrt_uvxy.zw grow along screen xy.
+    //  Returns:    The fragment's position in its 2x2 quad, measured twice:
+    //              1.) .xy is the 2x2 placement along the uv direction, with
+    //                  the origin (0, 0) at the top-left:
+    //                  top-left     = (-1.0, -1.0) top-right    = ( 1.0, -1.0)
+    //                  bottom-left  = (-1.0,  1.0) bottom-right = ( 1.0,  1.0)
+    //                  Use this to arrange/weight shared texture samples.
+    //              2.) .zw is the 2x2 placement along the screen xy direction
+    //                  (IN.position); its origin varies.
+    //                  quad_gather needs this measure to work correctly.
+    //              Note: quad_vector.zw = quad_vector.xy * float2(
+    //                      ddx(output_pixel_num_wrt_uvxy.x),
+    //                      ddy(output_pixel_num_wrt_uvxy.y));
+    //  Caveats:    Assumes the GPU driver always starts 2x2 pixel quads at
+    //              even pixel numbers.  That assumption can be wrong
+    //              (nondeterministically so) for odd output resolutions.
+    //  Pixel parity (0.0/1.0 for whole pixel numbers), then remap to -1.0/1.0:
+    float4 parity = 2.0 * frac(0.5 * output_pixel_num_wrt_uvxy);
+    return 2.0 * parity - float4(1.0);
+}
+
+float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   Same as get_quad_vector_naive() (see that first).
+    //  Returns:    Same measures as get_quad_vector_naive() (see that first),
+    //              but still correct when the 2x2 pixel quad starts at an odd
+    //              pixel, which can occur at odd resolutions.
+    float4 naive_vec = get_quad_vector_naive(output_pixel_num_wrt_uvxy);
+    //  If naive_vec.zw doesn't increase with screen xy, we know the 2x2
+    //  pixel quad starts at an odd pixel.  The naive components are -1.0 or
+    //  1.0, so half of their screen-space derivative is the sign used to
+    //  mirror the guess:
+    float2 mirror = 0.5 * float2(ddx(naive_vec.z), ddy(naive_vec.w));
+    return naive_vec * mirror.xyxy;
+}
+
+float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
+{
+    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //              2.) output_pixel_num_wrt_uv increases with uv coords and
+    //                  gives the current fragment's output pixel number in:
+    //                      ([0, IN.output_size.x), [0, IN.output_size.y))
+    //  Returns:    Same measures as get_quad_vector_naive() (see that first),
+    //              but still correct when the 2x2 pixel quad starts at an odd
+    //              pixel, which can occur at odd resolutions.
+    //  Caveats:    Needs less information than the version taking a float4,
+    //              but it's potentially slower.
+    //  Quad placement along uv: pixel parity remapped from 0.0/1.0 to
+    //  -1.0/1.0.
+    float2 parity_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
+    float2 uv_guess = (parity_uv - float2(0.5)) * 2.0;
+    //  Do screen coords increase with or against uv?  Get the direction
+    //  with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}.
+    float2 uv_to_screen = float2(ddx(output_pixel_num_wrt_uv.x),
+                                    ddy(output_pixel_num_wrt_uv.y));
+    float2 screen_guess = uv_guess * uv_to_screen;
+    //  If screen_guess doesn't increase with screen xy, we know the 2x2
+    //  pixel quad starts at an odd pixel:
+    float2 odd_start_mirror = 0.5 * float2(ddx(screen_guess.x),
+                                                ddy(screen_guess.y));
+    return float4(uv_guess, screen_guess) * odd_start_mirror.xyxy;
+}
+
+void quad_gather(float4 quad_vector, float4 curr,
+    out float4 adjx, out float4 adjy, out float4 diag)
+{
+    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //              2.) The GPU driver is using fine/high-quality derivatives.
+    //              3.) quad_vector gives the current fragment's location in its
+    //                  2x2 pixel quad, following get_quad_vector()'s conventions.
+    //              4.) curr is any vector whose neighboring values you want.
+    //  Returns:    Via out parameters, the value of curr at the neighboring
+    //              fragments: adjacent x, adjacent y, and diagonal.
+    adjy = curr - quad_vector.w * ddy(curr);
+    adjx = curr - quad_vector.z * ddx(curr);
+    diag = adjx - quad_vector.w * ddy(adjx);
+}
+
+void quad_gather(float4 quad_vector, float3 curr,
+    out float3 adjx, out float3 adjy, out float3 diag)
+{
+    //  Float3 version: x-step uses quad_vector.z, y-step uses quad_vector.w.
+    adjy = curr - quad_vector.w * ddy(curr);
+    adjx = curr - quad_vector.z * ddx(curr);
+    diag = adjx - quad_vector.w * ddy(adjx);
+}
+
+void quad_gather(float4 quad_vector, float2 curr,
+    out float2 adjx, out float2 adjy, out float2 diag)
+{
+    //  Float2 version: x-step uses quad_vector.z, y-step uses quad_vector.w.
+    adjy = curr - quad_vector.w * ddy(curr);
+    adjx = curr - quad_vector.z * ddx(curr);
+    diag = adjx - quad_vector.w * ddy(adjx);
+}
+
+float4 quad_gather(float4 quad_vector, float curr)
+{
+    //  Float version; the four quad values are packed into the result:
+    //  Returns:    return.x == current
+    //              return.y == adjacent x
+    //              return.z == adjacent y
+    //              return.w == diagonal
+    float4 quad_vals = float4(curr);
+    quad_vals.y = quad_vals.x - quad_vector.z * ddx(quad_vals.x);
+    quad_vals.zw = quad_vals.xy - quad_vector.w * ddy(quad_vals.xy);
+    return quad_vals;
+}
+
+float4 quad_gather_sum(float4 quad_vector, float4 curr)
+{
+    //  Requires:   Same as quad_gather()
+    //  Returns:    curr summed over all four fragments of the 2x2 quad.
+    float4 from_x, from_y, from_diag;
+    quad_gather(quad_vector, curr, from_x, from_y, from_diag);
+    return (curr + from_x + from_y + from_diag);
+}
+
+float3 quad_gather_sum(float4 quad_vector, float3 curr)
+{
+    //  Float3 version: gather the three neighbors, then add them to curr.
+    float3 from_x, from_y, from_diag;
+    quad_gather(quad_vector, curr, from_x, from_y, from_diag);
+    return (curr + from_x + from_y + from_diag);
+}
+
+float2 quad_gather_sum(float4 quad_vector, float2 curr)
+{
+    //  Float2 version: gather the three neighbors, then add them to curr.
+    float2 from_x, from_y, from_diag;
+    quad_gather(quad_vector, curr, from_x, from_y, from_diag);
+    return (curr + from_x + from_y + from_diag);
+}
+
+float quad_gather_sum(float4 quad_vector, float curr)
+{
+    //  Float version: quad_gather() packs all four quad values in one float4.
+    float4 quad_vals = quad_gather(quad_vector, curr);
+    return (quad_vals.x + quad_vals.y + quad_vals.z + quad_vals.w);
+}
+
+bool fine_derivatives_working(float4 quad_vector, float4 curr)
+{
+    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //              2.) quad_vector gives the current fragment's location in its
+    //                  2x2 pixel quad, following get_quad_vector()'s conventions.
+    //              3.) curr must be a test vector with non-constant derivatives
+    //                  (its value should change nonlinearly across fragments).
+    //  Returns:    true if fine/hybrid/high-quality derivatives are used, or
+    //              false if coarse derivatives are used or it's inconclusive.
+    //  Usage:      Test whether quad-pixel communication is working!
+    //  Method:     Fine derivatives are confirmed if the following holds
+    //              (ever, for any value at any fragment):
+    //                  (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
+    //              The more values we test (e.g. test a float4 two ways), the
+    //              easier it is to demonstrate fine derivatives are working.
+    //  TODO: Check for floating point exact comparison issues!
+    float4 dx_here = ddx(curr);
+    float4 dy_here = ddy(curr);
+    float4 dy_at_adjx = ddy(curr - dx_here * quad_vector.z);
+    float4 dx_at_adjy = ddx(curr - dy_here * quad_vector.w);
+    bool ddy_different = any(bool4(dy_here.x != dy_at_adjx.x, dy_here.y != dy_at_adjx.y, dy_here.z != dy_at_adjx.z, dy_here.w != dy_at_adjx.w));
+    bool ddx_different = any(bool4(dx_here.x != dx_at_adjy.x, dx_here.y != dx_at_adjy.y, dx_here.z != dx_at_adjy.z, dx_here.w != dx_at_adjy.w));
+    return ddy_different || ddx_different;
+}
+
+bool fine_derivatives_working_fast(float4 quad_vector, float curr)
+{
+    //  Requires:   Same as fine_derivatives_working()
+    //  Returns:    Same as fine_derivatives_working()
+    //  Usage:      Faster than fine_derivatives_working() but more likely to
+    //              return false negatives, so it's less useful for offline
+    //              testing/debugging.  It's also useless as the basis for
+    //              dynamic runtime branching as of May 2014: Derivatives (and
+    //              quad-pixel communication) are currently disallowed in
+    //              branches.  However, future GPU's may allow them in dynamic
+    //              branches if you promise the branch condition evaluates the
+    //              same for every fragment in the quad (and/or if the driver
+    //              enforces that promise by making a single fragment control
+    //              branch decisions).  If that ever happens, this version may
+    //              become a more economical choice.
+    float dy_here = ddy(curr);
+    float dx_here = ddx(curr);
+    float at_adjx = curr - dx_here * quad_vector.z;
+    return (ddy(at_adjx) != dy_here);
+}
+
+#endif  //  QUAD_PIXEL_COMMUNICATION_H
+
+////////////////////////  END QUAD-PIXEL-COMMUNICATION  ///////////////////////
+
+//#include "special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+    //              Its max absolute error is 2.5*10**-5, with solid numerical
+    //              robustness and efficiency.  See:
+	//                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+	static const float4 one = float4(1.0);
+	const float4 t = one/(one + 0.47047*abs(x));
+	const float4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+	const float4 magnitude = one - poly*exp(-(x*x));
+	//  The polynomial is evaluated on abs(x); reapply the sign of x.
+	return magnitude * sign(x);
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 version: the polynomial in t is evaluated on abs(x), and the
+    //  sign of x is reapplied at the end.
+	static const float3 one = float3(1.0);
+	const float3 t = one/(one + 0.47047*abs(x));
+	const float3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+	const float3 magnitude = one - poly*exp(-(x*x));
+	return magnitude * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 version: the polynomial in t is evaluated on abs(x), and the
+    //  sign of x is reapplied at the end.
+	static const float2 one = float2(1.0);
+	const float2 t = one/(one + 0.47047*abs(x));
+	const float2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+	const float2 magnitude = one - poly*exp(-(x*x));
+	return magnitude * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Float version: the polynomial in t is evaluated on abs(x), and the
+    //  sign of x is reapplied at the end.
+	const float t = 1.0/(1.0 + 0.47047*abs(x));
+	const float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+	const float magnitude = 1.0 - poly*exp(-(x*x));
+	return magnitude * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf() approximated with the hyperbolic tangent.  The error
+    //              is visually noticeable, but it's blazing fast and
+    //              perceptually close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly implement
+    //              tanh(): My nVidia 8800GTS returns garbage output.
+	return tanh(x * 1.202760580);
+}
+
+float3 erft(const float3 x)
+{
+	const float3 scaled_x = 1.202760580 * x;  //  Float3 version
+	return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+	const float2 scaled_x = 1.202760580 * x;  //  Float2 version
+	return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+	const float scaled_x = 1.202760580 * x;  //  Float version
+	return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 version: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 version: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+inline float erf(const float x)
+{
+    //  Float version: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  This implementation function makes the
+    //                  caller precompute that value, which gives users the
+    //                  opportunity to reuse it.
+    //  Returns:    Approximate gamma function (real-numbered factorial)
+    //              output from the Lanczos approximation with two coefficients
+    //              calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  We could use three coeffs (0.0000346 error)
+    //              without hurting latency, but this allows more parallelism
+    //              with outside instructions.
+	static const float4 e = float4(2.71828182845904523536028747135266249775724709);
+	static const float4 g = float4(1.12906830989);
+	static const float4 c0 = float4(0.8109119309638332633713423362694399653724431);
+	static const float4 c1 = float4(0.4808354605142681877121661197951496120000040);
+	const float4 s_plus_half = s + float4(0.5);
+	const float4 lanczos_base = (s_plus_half + g)/e;  //  or (s + g + float4(0.5))/e
+	const float4 series = c0 + c1/(s + float4(1.0));
+	//  gamma(s + 1) = lanczos_base**s_plus_half * series; divide by s for
+	//  gamma(s).  This has less error for small s's than (s -= 1.0) first.
+	return (pow(lanczos_base, s_plus_half) * series) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 version: gamma(s + 1) from two Lanczos coefficients, times s_inv.
+	static const float3 e = float3(2.71828182845904523536028747135266249775724709);
+	static const float3 g = float3(1.12906830989);
+	static const float3 c0 = float3(0.8109119309638332633713423362694399653724431);
+	static const float3 c1 = float3(0.4808354605142681877121661197951496120000040);
+	const float3 s_plus_half = s + float3(0.5);
+	const float3 lanczos_base = (s_plus_half + g)/e;
+	const float3 series = c0 + c1/(s + float3(1.0));
+	return (pow(lanczos_base, s_plus_half) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 version: gamma(s + 1) from two Lanczos coefficients, times s_inv.
+	static const float2 e = float2(2.71828182845904523536028747135266249775724709);
+	static const float2 g = float2(1.12906830989);
+	static const float2 c0 = float2(0.8109119309638332633713423362694399653724431);
+	static const float2 c1 = float2(0.4808354605142681877121661197951496120000040);
+	const float2 s_plus_half = s + float2(0.5);
+	const float2 lanczos_base = (s_plus_half + g)/e;
+	const float2 series = c0 + c1/(s + float2(1.0));
+	return (pow(lanczos_base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Float version: gamma(s + 1) from two Lanczos coefficients, times s_inv.
+	static const float e = 2.71828182845904523536028747135266249775724709;
+	static const float g = 1.12906830989;
+	static const float c0 = 0.8109119309638332633713423362694399653724431;
+	static const float c1 = 0.4808354605142681877121661197951496120000040;
+	const float s_plus_half = s + 0.5;
+	const float lanczos_base = (s_plus_half + g)/e;
+	const float series = c0 + c1/(s + 1.0);
+	return (pow(lanczos_base, s_plus_half) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard parameter to the gamma function, and it
+    //              should lie in the [0, 36] range.
+    //  Returns:    Approximate gamma(s), max relative error 0.000463 (see gamma_impl).
+	const float4 s_inv = float4(1.0)/s;
+	return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+	const float3 s_inv = float3(1.0)/s;  //  Float3 version: reciprocal that gamma_impl() expects
+	return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+	const float2 s_inv = float2(1.0)/s;  //  Float2 version: reciprocal that gamma_impl() expects
+	return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+	const float s_inv = 1.0/s;  //  Float version: reciprocal that gamma_impl() expects
+	return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The "rolled up" summation (before scaling by z**s) looks like:
+	//      last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
+	//      sum = last_sign * last_pow / (s * last_factorial);  //  k = 0 term
+	//      for(int k = 1; k < 4; ++k)
+	//      {
+	//          last_sign *= -1.0; last_pow *= z; last_factorial *= k;
+	//          sum += last_sign * last_pow / ((s + k) * last_factorial);
+	//      }
+	//  Unrolled, constant-unfolded and arranged for madds and parallelism:
+	const float4 z2 = z*z;
+	const float4 k1_denom = s + float4(1.0);
+	const float4 k2_denom = 2.0*s + float4(4.0);
+	const float4 k3_denom = 6.0*s + float4(18.0);
+	//float4 k4_denom = 24.0*s + float4(96.0);
+	//  Summation iteration 0 result, then iterations 1, 2, and 3:
+	float4 series = s_inv;
+	series -= z/k1_denom;
+	series += z2/k2_denom;
+	series -= z * z2/k3_denom;
+	//series += z2 * z2 / k4_denom;
+	//  Scale and return:
+	const float4 z_pow_s = pow(z, s);
+	return z_pow_s * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 version: z**s * (1/s - z/(s+1) + z**2/(2s+4) - z**3/(6s+18))
+	const float3 z2 = z*z;
+	const float3 k1_denom = s + float3(1.0);
+	const float3 k2_denom = 2.0*s + float3(4.0);
+	const float3 k3_denom = 6.0*s + float3(18.0);
+	float3 series = s_inv;  //  k = 0 term
+	series -= z/k1_denom;
+	series += z2/k2_denom;
+	series -= z * z2/k3_denom;
+	const float3 z_pow_s = pow(z, s);
+	return z_pow_s * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 version: z**s * (1/s - z/(s+1) + z**2/(2s+4) - z**3/(6s+18))
+	const float2 z2 = z*z;
+	const float2 k1_denom = s + float2(1.0);
+	const float2 k2_denom = 2.0*s + float2(4.0);
+	const float2 k3_denom = 6.0*s + float2(18.0);
+	float2 series = s_inv;  //  k = 0 term
+	series -= z/k1_denom;
+	series += z2/k2_denom;
+	series -= z * z2/k3_denom;
+	const float2 z_pow_s = pow(z, s);
+	return z_pow_s * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload: z^s * sum_{k=0..3} (-z)^k / (k! * (s + k)), s_inv = 1/s.
+    const float z_squared = z*z;
+    const float k1_denom = s + 1.0;         //  1! * (s + 1)
+    const float k2_denom = 2.0*s + 4.0;     //  2! * (s + 2)
+    const float k3_denom = 6.0*s + 18.0;    //  3! * (s + 3)
+    float series = s_inv;                   //  k = 0 term: 1/s
+    series -= z/k1_denom;
+    series += z_squared/k2_denom;
+    series -= z * z_squared/k3_denom;
+    const float z_pow_s = pow(z, s);
+    return z_pow_s * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Upper incomplete gamma function via a truncated continued fraction.
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    z^s * exp(-z) divided by four levels of Gauss's continued
+    //              fraction for the upper incomplete gamma function.
+    //  The fraction is cut off after level 4 and evaluated innermost-first;
+    //  as a loop it would read:
+    //      cf = float4('inf');
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          cf = ((i * 2.0) - float4(1.0)) + z - s + (i * (s - i))/cf;
+    //      }
+    //  Unrolled, with the i * (s - i) numerators folded into constants:
+    float4 cf = float4(7.0) + z - s;                        //  i = 4
+    cf = float4(5.0) + z - s + (3.0*s - float4(9.0))/cf;    //  i = 3
+    cf = float4(3.0) + z - s + (2.0*s - float4(4.0))/cf;    //  i = 2
+    cf = float4(1.0) + z - s + (s - float4(1.0))/cf;        //  i = 1
+    const float4 prefactor = pow(z, s) * exp(-z);
+    return prefactor / cf;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  float3 overload: z^s * exp(-z) over a 4-level continued fraction.
+    float3 cf = float3(7.0) + z - s;                        //  innermost level
+    cf = float3(5.0) + z - s + (3.0*s - float3(9.0))/cf;
+    cf = float3(3.0) + z - s + (2.0*s - float3(4.0))/cf;
+    cf = float3(1.0) + z - s + (s - float3(1.0))/cf;        //  outermost level
+    const float3 prefactor = pow(z, s) * exp(-z);
+    return prefactor / cf;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  float2 overload: z^s * exp(-z) over a 4-level continued fraction.
+    float2 cf = float2(7.0) + z - s;                        //  innermost level
+    cf = float2(5.0) + z - s + (3.0*s - float2(9.0))/cf;
+    cf = float2(3.0) + z - s + (2.0*s - float2(4.0))/cf;
+    cf = float2(1.0) + z - s + (s - float2(1.0))/cf;        //  outermost level
+    const float2 prefactor = pow(z, s) * exp(-z);
+    return prefactor / cf;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: z^s * exp(-z) over a 4-level continued fraction.
+    float cf = 7.0 + z - s;                     //  innermost level
+    cf = 5.0 + z - s + (3.0*s - 9.0)/cf;
+    cf = 3.0 + z - s + (2.0*s - 4.0)/cf;
+    cf = 1.0 + z - s + (s - 1.0)/cf;            //  outermost level
+    const float prefactor = pow(z, s) * exp(-z);
+    return prefactor / cf;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Normalized lower incomplete gamma function for small s (float4).
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed so callers can reuse it)
+    //              3.) gamma_s_inv = 1/gamma(s) (likewise precomputed)
+    //  Returns:    Per component, an approximation of the normalized lower
+    //              incomplete gamma function.  Restricting s to < 0.5 leaves
+    //              two cases (not four), chosen by z.  Each uses four terms;
+    //              max relative error is ~0.00182.  The split point and
+    //              details were adapted for fewer terms from this paper by
+    //              Gil/Segura/Temme: http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both cases are always computed: real branches tested slower.
+    static const float4 z_split = float4(0.775075);
+    const float4 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float4 from_fraction = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    //  Build a per-component mask selecting the large-z case:
+    bool4 use_fraction;
+    use_fraction.x = z.x > z_split.x;
+    use_fraction.y = z.y > z_split.y;
+    use_fraction.z = z.z > z_split.z;
+    use_fraction.w = z.w > z_split.w;
+    bool4 use_series = not(use_fraction);
+    return from_fraction * float4(use_fraction) + from_series * float4(use_series);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  float3 overload: compute both cases, then mask per component by z.
+    static const float3 z_split = float3(0.775075);
+    const float3 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float3 from_fraction = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    bool3 use_fraction;
+    use_fraction.x = z.x > z_split.x;
+    use_fraction.y = z.y > z_split.y;
+    use_fraction.z = z.z > z_split.z;
+    bool3 use_series = not(use_fraction);
+    return from_fraction * float3(use_fraction) + from_series * float3(use_series);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  float2 overload: compute both cases, then mask per component by z.
+    static const float2 z_split = float2(0.775075);
+    const float2 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float2 from_fraction = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    bool2 use_fraction;
+    use_fraction.x = z.x > z_split.x;
+    use_fraction.y = z.y > z_split.y;
+    bool2 use_series = not(use_fraction);
+    return from_fraction * float2(use_fraction) + from_series * float2(use_series);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload: both cases are computed, then one is masked out by z.
+    static const float z_split = 0.775075;
+    const float from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float from_fraction = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const bool use_fraction = z > z_split;
+    return from_fraction * float(use_fraction) + from_series * float(!use_fraction);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    normalized_ligamma_impl() evaluated with the reciprocals
+    //              1/s and 1/gamma_impl(s, 1/s) computed here on the fly.
+    const float4 recip_s = float4(1.0)/s;
+    const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  float3 overload: derive 1/s and 1/gamma_impl(s, 1/s), then delegate.
+    const float3 recip_s = float3(1.0)/s;
+    const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  float2 overload: derive 1/s and 1/gamma_impl(s, 1/s), then delegate.
+    const float2 recip_s = float2(1.0)/s;
+    const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload: derive 1/s and 1/gamma_impl(s, 1/s), then delegate.
+    const float recip_s = 1.0/s;
+    const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 uv2_to_uv4(float2 tex_uv)
+{
+    //  Pad a float2 uv offset with zero z/w so it adds to float4 tex2Dlod coords:
+    return float4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Length-squared helper macro (for use with static constants); vec is expanded twice:
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+inline float get_fast_gaussian_weight_sum_inv(const float sigma)
+{
+    //  Approximates 1/(sum of unnormalized Gaussian tap weights) for large
+    //  blurs.  The unnormalized center weight is 1.0, so the normalized
+    //  center weight equals the weight sum inverse, and for a large enough
+    //  blur (9+) the Gaussian integral gives its asymptotic value:
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //  which, because erf(-x) == -erf(x), reduces to the single erf below:
+    //      static const float temp = 0.5/sqrt(2.0);
+    //      return erf(temp/sigma);
+    //  A curve fit is used instead: it is faster still (relative FPS: 134.3
+    //  with erf, 135.8 with curve-fitting) and closer than the asymptotic
+    //  results.  It was built from 64 blur sizes from [3, 131) and 255
+    //  equally-spaced sigmas from (0, blurN_std_dev), so results for smaller
+    //  sigmas are biased toward smaller blurs.  Max error: 0.0031793913.
+    const float fitted_curve = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    const float inverse_sigma_cap = 0.399334576340352/sigma;
+    return min(fitted_curve, inverse_sigma_cap);
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup using 11 taps
+    //              spaced dxdy apart and centered on tex_uv.  It may be
+    //              mipmapped depending on settings and dxdy.
+    //  Tap weights are exp(-d*d/(2.0*sigma*sigma)) for distances d = 0-5;
+    //  constant factors are ignored because the sum is normalized anyway.
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float w5 = exp(-25.0 * inv_two_var);
+    const float inv_weight_total =
+        1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Accumulate the weighted taps in order, then normalize the total:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    blurred += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    blurred += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    blurred += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup using 9 taps
+    //              spaced dxdy apart and centered on tex_uv.  It may be
+    //              mipmapped depending on settings and dxdy.
+    const float inv_two_var = 0.5/(sigma*sigma);    //  1/(2*sigma^2)
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+    //  Accumulate the Gaussian-weighted taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    blurred += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup using 7 taps
+    //              spaced dxdy apart and centered on tex_uv.  It may be
+    //              mipmapped depending on settings and dxdy.
+    const float inv_two_var = 0.5/(sigma*sigma);    //  1/(2*sigma^2)
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (w0 + 2.0 * (w1 + w2 + w3));
+    //  Accumulate the Gaussian-weighted taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup using 5 taps
+    //              spaced dxdy apart and centered on tex_uv.  It may be
+    //              mipmapped depending on settings and dxdy.
+    const float inv_two_var = 0.5/(sigma*sigma);    //  1/(2*sigma^2)
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (w0 + 2.0 * (w1 + w2));
+    //  Accumulate the Gaussian-weighted taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 3 taps
+    //              spaced dxdy apart and centered on tex_uv.  It may be
+    //              mipmapped depending on settings and dxdy.
+    const float inv_two_var = 0.5/(sigma*sigma);    //  1/(2*sigma^2)
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (w0 + 2.0 * w1);
+    //  Accumulate the Gaussian-weighted taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup using 6 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Per-texel Gaussian weights for distances 0-5 and their normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float w5 = exp(-25.0 * inv_two_var);
+    const float inv_weight_total =
+        1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Each linear tap blends a texel pair: weight = their sum, offset = the
+    //  far texel's share.  Both inner taps use the center texel, so halve w0.
+    const float pair01 = w0 * 0.5 + w1;
+    const float blend01 = w1/pair01;
+    const float pair23 = w2 + w3;
+    const float blend23 = w3/pair23;
+    const float pair45 = w4 + w5;
+    const float blend45 = w5/pair45;
+    //  Accumulate the weighted linear taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += pair45 * tex2D_linearize(tex, tex_uv - (4.0 + blend45) * dxdy).rgb;
+    blurred += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + blend23) * dxdy).rgb;
+    blurred += pair01 * tex2D_linearize(tex, tex_uv - blend01 * dxdy).rgb;
+    blurred += pair01 * tex2D_linearize(tex, tex_uv + blend01 * dxdy).rgb;
+    blurred += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + blend23) * dxdy).rgb;
+    blurred += pair45 * tex2D_linearize(tex, tex_uv + (4.0 + blend45) * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 4 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Per-texel Gaussian weights for distances 0-4 and their normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+    //  Each linear tap blends a texel pair: weight = sum, offset = far share.
+    const float pair12 = w1 + w2;
+    const float blend12 = w2/pair12;
+    const float pair34 = w3 + w4;
+    const float blend34 = w4/pair34;
+    //  Accumulate the center tap and linear taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += pair34 * tex2D_linearize(tex, tex_uv - (3.0 + blend34) * dxdy).rgb;
+    blurred += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + blend12) * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + blend12) * dxdy).rgb;
+    blurred += pair34 * tex2D_linearize(tex, tex_uv + (3.0 + blend34) * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup using 4 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Per-texel Gaussian weights for distances 0-3 and their normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (w0 + 2.0 * (w1 + w2 + w3));
+    //  Each linear tap blends a texel pair: weight = their sum, offset = the
+    //  far texel's share.  Both inner taps use the center texel, so halve w0.
+    const float pair01 = w0 * 0.5 + w1;
+    const float blend01 = w1/pair01;
+    const float pair23 = w2 + w3;
+    const float blend23 = w3/pair23;
+    //  Accumulate the weighted linear taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + blend23) * dxdy).rgb;
+    blurred += pair01 * tex2D_linearize(tex, tex_uv - blend01 * dxdy).rgb;
+    blurred += pair01 * tex2D_linearize(tex, tex_uv + blend01 * dxdy).rgb;
+    blurred += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + blend23) * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 2 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Per-texel Gaussian weights for distances 0-2 and their normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float inv_weight_total = 1.0 / (w0 + 2.0 * (w1 + w2));
+    //  Each linear tap blends a texel pair: weight = sum, offset = far share.
+    const float pair12 = w1 + w2;
+    const float blend12 = w2/pair12;
+    //  Accumulate the center tap and linear taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + blend12) * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + blend12) * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Texel weights for distances 0 and 1 (constant factors ignored):
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    //  No weight_sum_inv is needed here: each tap's normalized weight is
+    //  w01/(w0 + 2.0*w1) == 0.5 exactly, so only the tap offset matters.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w01 = w0 * 0.5 + w1;
+    const float w01_ratio = w1/w01;
+    //  Weights for all samples are the same, so just average them:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+
+////////////////////////////  HUGE SEPARABLE BLURS  ////////////////////////////
+
+//  Huge separable blurs come only in "fast" versions.
+float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Per-texel Gaussian weights for distances 0-21:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float w5 = exp(-25.0 * inv_two_var);
+    const float w6 = exp(-36.0 * inv_two_var);
+    const float w7 = exp(-49.0 * inv_two_var);
+    const float w8 = exp(-64.0 * inv_two_var);
+    const float w9 = exp(-81.0 * inv_two_var);
+    const float w10 = exp(-100.0 * inv_two_var);
+    const float w11 = exp(-121.0 * inv_two_var);
+    const float w12 = exp(-144.0 * inv_two_var);
+    const float w13 = exp(-169.0 * inv_two_var);
+    const float w14 = exp(-196.0 * inv_two_var);
+    const float w15 = exp(-225.0 * inv_two_var);
+    const float w16 = exp(-256.0 * inv_two_var);
+    const float w17 = exp(-289.0 * inv_two_var);
+    const float w18 = exp(-324.0 * inv_two_var);
+    const float w19 = exp(-361.0 * inv_two_var);
+    const float w20 = exp(-400.0 * inv_two_var);
+    const float w21 = exp(-441.0 * inv_two_var);
+    //  The exact normalizer is the reciprocal of the full weight sum,
+    //      1.0 / (w0 + 2.0 * (w1 + w2 + ... + w20 + w21)),
+    //  but a faster curve-fit approximation of it is used instead:
+    const float inv_weight_total = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Each linear tap blends a texel pair: weight = their sum, offset = the
+    //  far texel's share.  Both inner taps use the center texel, so halve w0.
+    const float pair0_1 = w0 * 0.5 + w1;
+    const float blend0_1 = w1/pair0_1;
+    const float pair2_3 = w2 + w3;
+    const float blend2_3 = w3/pair2_3;
+    const float pair4_5 = w4 + w5;
+    const float blend4_5 = w5/pair4_5;
+    const float pair6_7 = w6 + w7;
+    const float blend6_7 = w7/pair6_7;
+    const float pair8_9 = w8 + w9;
+    const float blend8_9 = w9/pair8_9;
+    const float pair10_11 = w10 + w11;
+    const float blend10_11 = w11/pair10_11;
+    const float pair12_13 = w12 + w13;
+    const float blend12_13 = w13/pair12_13;
+    const float pair14_15 = w14 + w15;
+    const float blend14_15 = w15/pair14_15;
+    const float pair16_17 = w16 + w17;
+    const float blend16_17 = w17/pair16_17;
+    const float pair18_19 = w18 + w19;
+    const float blend18_19 = w19/pair18_19;
+    const float pair20_21 = w20 + w21;
+    const float blend20_21 = w21/pair20_21;
+    //  Accumulate the weighted linear taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += pair20_21 * tex2D_linearize(tex, tex_uv - (20.0 + blend20_21) * dxdy).rgb;
+    blurred += pair18_19 * tex2D_linearize(tex, tex_uv - (18.0 + blend18_19) * dxdy).rgb;
+    blurred += pair16_17 * tex2D_linearize(tex, tex_uv - (16.0 + blend16_17) * dxdy).rgb;
+    blurred += pair14_15 * tex2D_linearize(tex, tex_uv - (14.0 + blend14_15) * dxdy).rgb;
+    blurred += pair12_13 * tex2D_linearize(tex, tex_uv - (12.0 + blend12_13) * dxdy).rgb;
+    blurred += pair10_11 * tex2D_linearize(tex, tex_uv - (10.0 + blend10_11) * dxdy).rgb;
+    blurred += pair8_9 * tex2D_linearize(tex, tex_uv - (8.0 + blend8_9) * dxdy).rgb;
+    blurred += pair6_7 * tex2D_linearize(tex, tex_uv - (6.0 + blend6_7) * dxdy).rgb;
+    blurred += pair4_5 * tex2D_linearize(tex, tex_uv - (4.0 + blend4_5) * dxdy).rgb;
+    blurred += pair2_3 * tex2D_linearize(tex, tex_uv - (2.0 + blend2_3) * dxdy).rgb;
+    blurred += pair0_1 * tex2D_linearize(tex, tex_uv - blend0_1 * dxdy).rgb;
+    blurred += pair0_1 * tex2D_linearize(tex, tex_uv + blend0_1 * dxdy).rgb;
+    blurred += pair2_3 * tex2D_linearize(tex, tex_uv + (2.0 + blend2_3) * dxdy).rgb;
+    blurred += pair4_5 * tex2D_linearize(tex, tex_uv + (4.0 + blend4_5) * dxdy).rgb;
+    blurred += pair6_7 * tex2D_linearize(tex, tex_uv + (6.0 + blend6_7) * dxdy).rgb;
+    blurred += pair8_9 * tex2D_linearize(tex, tex_uv + (8.0 + blend8_9) * dxdy).rgb;
+    blurred += pair10_11 * tex2D_linearize(tex, tex_uv + (10.0 + blend10_11) * dxdy).rgb;
+    blurred += pair12_13 * tex2D_linearize(tex, tex_uv + (12.0 + blend12_13) * dxdy).rgb;
+    blurred += pair14_15 * tex2D_linearize(tex, tex_uv + (14.0 + blend14_15) * dxdy).rgb;
+    blurred += pair16_17 * tex2D_linearize(tex, tex_uv + (16.0 + blend16_17) * dxdy).rgb;
+    blurred += pair18_19 * tex2D_linearize(tex, tex_uv + (18.0 + blend18_19) * dxdy).rgb;
+    blurred += pair20_21 * tex2D_linearize(tex, tex_uv + (20.0 + blend20_21) * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 31x Gaussian blurred texture lookup using 16 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Per-texel Gaussian weights for distances 0-15:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float w5 = exp(-25.0 * inv_two_var);
+    const float w6 = exp(-36.0 * inv_two_var);
+    const float w7 = exp(-49.0 * inv_two_var);
+    const float w8 = exp(-64.0 * inv_two_var);
+    const float w9 = exp(-81.0 * inv_two_var);
+    const float w10 = exp(-100.0 * inv_two_var);
+    const float w11 = exp(-121.0 * inv_two_var);
+    const float w12 = exp(-144.0 * inv_two_var);
+    const float w13 = exp(-169.0 * inv_two_var);
+    const float w14 = exp(-196.0 * inv_two_var);
+    const float w15 = exp(-225.0 * inv_two_var);
+    //  The exact normalizer is the reciprocal of the full weight sum,
+    //      1.0 / (w0 + 2.0 * (w1 + w2 + ... + w14 + w15)),
+    //  but a faster curve-fit approximation of it is used instead:
+    const float inv_weight_total = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Each linear tap blends a texel pair: weight = their sum, offset = the
+    //  far texel's share.  Both inner taps use the center texel, so halve w0.
+    const float pair0_1 = w0 * 0.5 + w1;
+    const float blend0_1 = w1/pair0_1;
+    const float pair2_3 = w2 + w3;
+    const float blend2_3 = w3/pair2_3;
+    const float pair4_5 = w4 + w5;
+    const float blend4_5 = w5/pair4_5;
+    const float pair6_7 = w6 + w7;
+    const float blend6_7 = w7/pair6_7;
+    const float pair8_9 = w8 + w9;
+    const float blend8_9 = w9/pair8_9;
+    const float pair10_11 = w10 + w11;
+    const float blend10_11 = w11/pair10_11;
+    const float pair12_13 = w12 + w13;
+    const float blend12_13 = w13/pair12_13;
+    const float pair14_15 = w14 + w15;
+    const float blend14_15 = w15/pair14_15;
+    //  Accumulate the weighted linear taps in order, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += pair14_15 * tex2D_linearize(tex, tex_uv - (14.0 + blend14_15) * dxdy).rgb;
+    blurred += pair12_13 * tex2D_linearize(tex, tex_uv - (12.0 + blend12_13) * dxdy).rgb;
+    blurred += pair10_11 * tex2D_linearize(tex, tex_uv - (10.0 + blend10_11) * dxdy).rgb;
+    blurred += pair8_9 * tex2D_linearize(tex, tex_uv - (8.0 + blend8_9) * dxdy).rgb;
+    blurred += pair6_7 * tex2D_linearize(tex, tex_uv - (6.0 + blend6_7) * dxdy).rgb;
+    blurred += pair4_5 * tex2D_linearize(tex, tex_uv - (4.0 + blend4_5) * dxdy).rgb;
+    blurred += pair2_3 * tex2D_linearize(tex, tex_uv - (2.0 + blend2_3) * dxdy).rgb;
+    blurred += pair0_1 * tex2D_linearize(tex, tex_uv - blend0_1 * dxdy).rgb;
+    blurred += pair0_1 * tex2D_linearize(tex, tex_uv + blend0_1 * dxdy).rgb;
+    blurred += pair2_3 * tex2D_linearize(tex, tex_uv + (2.0 + blend2_3) * dxdy).rgb;
+    blurred += pair4_5 * tex2D_linearize(tex, tex_uv + (4.0 + blend4_5) * dxdy).rgb;
+    blurred += pair6_7 * tex2D_linearize(tex, tex_uv + (6.0 + blend6_7) * dxdy).rgb;
+    blurred += pair8_9 * tex2D_linearize(tex, tex_uv + (8.0 + blend8_9) * dxdy).rgb;
+    blurred += pair10_11 * tex2D_linearize(tex, tex_uv + (10.0 + blend10_11) * dxdy).rgb;
+    blurred += pair12_13 * tex2D_linearize(tex, tex_uv + (12.0 + blend12_13) * dxdy).rgb;
+    blurred += pair14_15 * tex2D_linearize(tex, tex_uv + (14.0 + blend14_15) * dxdy).rgb;
+    return blurred * inv_weight_total;
+}
+
+float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 25x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 12 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Unnormalized weights exp(-d*d/(2*sigma*sigma)) for texel distances d = 0..12:
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  NOTE(review): helper defined elsewhere; presumably approximates the exact sum above -- confirm
+    //  Pair adjacent texels (1,2), (3,4), ...: each pair is read by one linear
+    //  tap carrying the pair's combined weight, placed ratio texels past the nearer one.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w9_10 = w9 + w10;
+    const float w11_12 = w11 + w12;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    const float w9_10_ratio = w10/w9_10;
+    const float w11_12_ratio = w12/w11_12;
+    //  Sum the weighted taps (6 per side plus the center), then normalize on return:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 17x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 8 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Unnormalized weights exp(-d*d/(2*sigma*sigma)) for texel distances d = 0..8:
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  NOTE(review): helper defined elsewhere; presumably approximates the exact sum above -- confirm
+    //  Pair adjacent texels (1,2), (3,4), ...: each pair is read by one linear
+    //  tap carrying the pair's combined weight, placed ratio texels past the nearer one.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    //  Sum the weighted taps (4 per side plus the center), then normalize on return:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input.
+    //  Description:
+    //  This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize
+    //  would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize.
+    const float denom_inv = 0.5/(sigma*sigma);
+    //  Load each sample.  We need all 3x3 samples.  Quad-pixel communication
+    //  won't help either: This should perform like tex2Dblur5x5, but sharing a
+    //  4x4 sample field would perform more like tex2Dblur8x8shared (worse).
+    const float2 sample4_uv = tex_uv;  //  samples are numbered 0-8 row by row; 4 is the center
+    const float2 dx = float2(dxdy.x, 0.0);
+    const float2 dy = float2(0.0, dxdy.y);
+    const float2 sample1_uv = sample4_uv - dy;
+    const float2 sample7_uv = sample4_uv + dy;
+    const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb;
+    const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
+    const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb;
+    const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb;
+    const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
+    const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb;
+    const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb;
+    const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
+    const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb;
+    //  Gaussian weights: w4 = center, w1_3_5_7 = the 4 axis neighbors, w0_2_6_8 = the 4 diagonals:
+    const float w4 = 1.0;
+    const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8));
+    //  Weight and sum the samples, then normalize by the total weight on return:
+    const float3 sum = w4 * sample4 +
+        w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) +
+        w0_2_6_8 * (sample0 + sample2 + sample6 + sample8);
+    return sum * weight_sum_inv;
+}
+
+
+////////////////////////////  FASTER ONE-PASS BLURS  ///////////////////////////
+
+float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.  Adjust the
+    //  bilinear sample location to reflect the true Gaussian weights for each
+    //  underlying texel.  The following diagram illustrates the relative
+    //  locations of bilinear samples.  Each sample with the same number has the
+    //  same weight (notice the symmetry).  The letters a, b, c, d distinguish
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float w4off = exp(-16.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    const float texel3to4ratio = w4off/(w3off + w4off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0);
+    const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+    const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio);
+    const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio);
+    const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2R1 = w3off;
+    const float w2R2 = w4off;
+    const float w3d1 =     exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w3d4 =     exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv);
+    const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv);
+    const float w6d1 =     exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv);
+    const float w6d4 =     exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2R1 + w2R2;
+    const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    const float w5 = w4;  //  sample 5 is sample 4 with x and y swapped, so its texel weights are the same
+    const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);  //  negates x: used for the "c" samples
+    const float2 mirror_y = float2(1.0, -1.0);  //  negates y: used for the "b" samples
+    const float2 mirror_xy = float2(-1.0, -1.0);  //  negates both: used for the "a" samples
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear:
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by weight_sum_inv so the weights total 1.0.
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
+    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);  //  center row/column is shared by two samples, so halve its weight
+    const float texel2to3ratio = w3off/(w2off + w3off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+    const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1abcd = 1.0;
+    const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv);
+    const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv);
+    const float w1d4 =       exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d3_3d2 =   exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4_3d4 =   exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d1 =       exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d2_4d3 =   exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4 =       exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights.
+    //  Split weights for shared texels between samples sharing them:
+    const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;  //  center texel split 4 ways; the two half-shared axis texels sum to one full weight
+    const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;
+    const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);  //  negates x: used for the "c" samples
+    const float2 mirror_y = float2(1.0, -1.0);  //  negates y: used for the "b" samples
+    const float2 mirror_xy = float2(-1.0, -1.0);  //  negates both: used for the "a" samples
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by weight_sum_inv so the weights total 1.0.
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
+    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
+    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
+    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  First see the description for tex2Dblur9x9().  This blur uses the same
+    //  concept and sample/texel locations except on a smaller scale.  Samples:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2d1 =   exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4 =   exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);  //  negates x: used for sample 2c
+    const float2 mirror_y = float2(1.0, -1.0);  //  negates y: used for sample 2b
+    const float2 mirror_xy = float2(-1.0, -1.0);  //  negates both: used for sample 2a
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by weight_sum_inv so the weights total 1.0.
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2a + sample2b + sample2c + sample2d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed of
+    //              2x2 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      0a 0b
+    //      0c 0d
+    //  The texel layout is as follows.  Note that samples 0a/0b and 0c/0d share
+    //  a vertical column of texels, and samples 0a/0c and 0b/0d share a
+    //  horizontal row of texels (all samples share the center texel):
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);  //  center row/column is shared by two samples, so halve its weight
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 4 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);  //  negates x: used for sample 0c
+    const float2 mirror_y = float2(1.0, -1.0);  //  negates y: used for sample 0b
+    const float2 mirror_xy = float2(-1.0, -1.0);  //  negates both: used for sample 0a
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb;
+    const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb;
+    const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb;
+    const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Weights for all samples are the same, so just average them:
+    return 0.25 * (sample0a + sample0b + sample0c + sample0d);
+}
+
+
+//////////////////  LINEAR ONE-PASS BLURS WITH SHARED SAMPLES  /////////////////
+
+float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   1.) Same as tex2Dblur9()
+    //              2.) ddx() and ddy() are present in the current Cg profile.
+    //              3.) The GPU driver is using fine/high-quality derivatives.
+    //              4.) quad_vector *correctly* describes the current fragment's
+    //                  location in its pixel quad, by the conventions noted in
+    //                  get_quad_vector[_naive].
+    //              5.) tex_uv.w = log2(IN.video_size/IN.output_size).y
+    //              6.) tex2Dlod() is present in the current Cg profile.
+    //  Optional:   Tune artifacts vs. excessive blurriness with the global
+    //              float error_blurring.
+    //  Returns:    A blurred texture lookup using a "virtual" 12x12 Gaussian
+    //              blur (a 6x6 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  Perform a 1-pass blur with shared texture lookups across a pixel quad.
+    //  We'll get neighboring samples with high-quality ddx/ddy derivatives, as
+    //  in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
+    //  Message Passing" by Eric Penner.
+    //
+    //  Our "virtual" 12x12 blur will be comprised of ((6 - 1)^2)/4 + 3 = 12
+    //  bilinear samples, where bilinear sampling positions are computed from
+    //  the relative Gaussian weights of the 4 surrounding texels.  The catch is
+    //  that the appropriate texel weights and sample coords differ for each
+    //  fragment, but we're reusing most of the same samples across a quad of
+    //  destination fragments.  (We do use unique coords for the four nearest
+    //  samples at each fragment.)  Mixing bilinear filtering and sample-sharing
+    //  therefore introduces some error into the weights, and this can get nasty
+    //  when the source image is small or high-frequency.  Computing bilinear
+    //  ratios based on weights at the sample field center results in sharpening
+    //  and ringing artifacts, but we can move samples closer to halfway between
+    //  texels to try blurring away the error (which can move features around by
+    //  a texel or so).  Tune this with the global float "error_blurring".
+    //
+    //  The pixel quad's sample field covers 12x12 texels, accessed through 6x6
+    //  bilinear (2x2 texel) taps.  Each fragment depends on a window of 10x10
+    //  texels (5x5 bilinear taps), and each fragment is responsible for loading
+    //  a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps
+    //  to use unique bilinear coords for sample0* for each fragment.  This
+    //  diagram illustrates the relative locations of bilinear samples 1-9 for
+    //  each quadrant a, b, c, d (note samples will not be equally spaced):
+    //      8a 7a 6a 6b 7b 8b
+    //      5a 4a 3a 3b 4b 5b
+    //      2a 1a 0a 0b 1b 2b
+    //      2c 1c 0c 0d 1d 2d
+    //      5c 4c 3c 3d 4d 5d
+    //      8c 7c 6c 6d 7d 8d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2 texel block:
+    //      8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
+    //      8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
+    //      5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
+    //      5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
+    //      2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
+    //      2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
+    //      2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
+    //      2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
+    //      5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
+    //      5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
+    //      8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
+    //      8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
+    //  With this symmetric arrangement, we don't have to know which absolute
+    //  quadrant a sample lies in to assign kernel weights; it's enough to know
+    //  the sample number and the relative quadrant of the sample (relative to
+    //  the current quadrant):
+    //      {current, adjacent x, adjacent y, diagonal}
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
+    //  and [4, 5] away from the fragment, and reuse them independently for both
+    //  dimensions.  Use the sample field center as the estimated destination,
+    //  but nudge the result closer to halfway between texels to blur error.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  based on the sum of their 4 underlying texel weights.  Assume a same-
+    //  resolution blur, so each symmetrically named sample weight will compute
+    //  the same at every fragment in the pixel quad: We can therefore compute
+    //  texel weights based only on the bottom-right quadrant (fragment at 0d0).
+    //  To avoid too much boilerplate code, use a macro to get all 4 texel
+    //  weights for a bilinear sample based on the offset of its top-left texel:
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
+    const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
+    const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
+    const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
+    const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
+    const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
+    const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
+    const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
+    const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
+    const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    float3 sample8adjx, sample8adjy, sample8diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
+    sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
+    sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
+    sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur10x10shared(const sampler2D tex,  //  texture to blur
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,  //  see tex2Dblur12x12shared() for requirements
+    const float sigma)  //  Gaussian standard deviation, in texels
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 10x10 Gaussian
+    //              blur (a 5x5 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This function
+    //  shares the same concept and sample placement, but each fragment only
+    //  uses 25 of the 36 samples taken across the pixel quad (to cover a 5x5
+    //  sample area, or 10x10 texel area), and it is meant to be used with a
+    //  lower standard deviation to compensate.  Thanks to symmetry, the 11
+    //  omitted samples are always the same ones:
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2), the Gaussian exponent scale
+    const float w0off   = 1.0;  //  1D Gaussian weight at a distance of 0 texels (exp(0))
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);  //  error_blurring pulls each shared ratio toward 0.5
+    //  sample0* are not shared, so base their ratios on the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant (integer base + fractional ratio):
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 25 of the 36 sample weights.  Skip the following weights:
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);  //  row with yoff = -4
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);  //  row with yoff = -2
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);  //  row with yoff = 0
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);  //  row with yoff = 2
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);  //  row with yoff = 4
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the inverse of the sum of all 25 used weights (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w4curr + w5curr + w6curr + w7curr + w8curr +
+        w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
+        w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
+        w0diag + w1diag + w3diag + w4diag);
+    //  Statically pack most weights for runtime.  Note the mixed packing:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);  //  samples 2 and 5 have no adjx/diag weights
+    const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);  //  samples 6 and 7 have no adjy/diag weights
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load this fragment's samples (sample0* via tex2D_linearize, the rest via tex2Dlod_linearize):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch neighbors' samples from the 2x2 quad (sample8 is only used as 8curr, so skip it):
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    //  Sum the weighted samples (normalization is applied once at the end).
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result.  First do the simple ones:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    //  Now do the mixed-sample ones (row order must match w2and5/w6and7 packing):
+    sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
+    sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
+    sum += w8curr * sample8curr;
+    //  Normalize the sum (so the weights add to 1.0) and return:
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur8x8shared(const sampler2D tex,  //  texture to blur
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,  //  see tex2Dblur12x12shared() for requirements
+    const float sigma)  //  Gaussian standard deviation, in texels
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 8x8 Gaussian
+    //              blur (a 4x4 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This function
+    //  shares the same concept and a similar sample placement, except each
+    //  quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
+    //  respectively.  There could be a total of 16 samples, 4 of which each
+    //  fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
+    //  its own offset to reduce shared sample artifacts, bringing the sample
+    //  count for each fragment to 7.  Sample placement:
+    //      3a 2a 2b 3b
+    //      1a 0a 0b 1b
+    //      1c 0c 0d 1d
+    //      3c 2c 2d 3d
+    //  Texel placement:
+    //      3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
+    //      3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
+    //      1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
+    //      1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
+    //      1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
+    //      1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
+    //      3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
+    //      3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
+    
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2), the Gaussian exponent scale
+    const float w0off   = 1.0;  //  1D Gaussian weight at a distance of 0 texels (exp(0))
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);  //  error_blurring pulls each shared ratio toward 0.5
+    //  sample0* are not shared, so base their ratios on the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant (integer base + fractional ratio):
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);  //  row with yoff = -4
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);  //  row with yoff = -2
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);  //  row with yoff = 0
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);  //  row with yoff = 2
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime as (curr, adjx, adjy, diag) per sample:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    //  Get the inverse of the sum of all 16 weights (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load this fragment's samples (sample0* via tex2D_linearize, the rest via tex2Dlod_linearize):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch samples 1-3 from the other fragments in the 2x2 quad (sample0* are per-fragment):
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    //  Sum the weighted samples (normalization is applied once at the end).
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    return sum * weight_sum_inv;  //  normalize so the weights add to 1.0
+}
+
+float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 6x6 Gaussian
+    //              blur (a 3x3 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur8x8shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 9 of the 16 samples taken across the pixel quad (to cover a
+    //  3x3 sample area, or 6x6 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 7 omitted samples
+    //  are always the "same:"
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 9 of the 16 sample weights.  Skip the following weights:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
+    //  Statically pack some weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result for sample1*, and handle the rest
+    //  of the weights more directly/verbosely:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
+            w2adjx * sample2adjx + w3curr * sample3curr;
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  MAX OPTIMAL SIGMA BLUR WRAPPERS  //////////////////////
+
+//  The following blurs are static wrappers around the dynamic blurs above.
+//  HOPEFULLY, the compiler will be smart enough to do constant-folding.
+
+//  Resizable separable blurs:
+inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Fast separable blurs:
+inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Huge, "fast" separable blurs:
+inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+//  Resizable one-pass blurs:
+inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" one-pass blurs:
+inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" shared-sample one-pass blurs:
+inline float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev);
+}
+inline float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev);
+}
+inline float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev);
+}
+inline float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev);
+}
+
+
+#endif  //  BLUR_FUNCTIONS_H
+
+////////////////////////////  END BLUR-FUNCTIONS  ///////////////////////////
+
+///////////////////////////////  BLOOM CONSTANTS  //////////////////////////////
+
+//  Compute constants with manual inlines of the functions below:
+static const float bloom_diff_thresh = 1.0/256.0;
+
+
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Requires:   1.) triad_size is the final phosphor triad size in pixels
+    //              2.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    Return the minimum sigma that will fully blur a phosphor
+    //              triad on the screen to an even color, within thresh.
+    //              This closed-form function was found by curve-fitting data.
+    //  Estimate: max error = ~0.086036, mean sq. error = ~0.0013387:
+    return -0.05168 + 0.6113*triad_size -
+        1.122*triad_size*sqrt(0.000416 + thresh);
+    //  Estimate: max error = ~0.16486, mean sq. error = ~0.0041041:
+    //return 0.5985*triad_size - triad_size*sqrt(thresh)
+}
+
+inline float get_absolute_scale_blur_sigma(const float thresh)
+{
+    //  Requires:   1.) min_allowed_viewport_triads must be a global vector.  The
+    //                  number of horizontal phosphor triads in the final image
+    //                  must be >= its .x component for realistic results.
+    //              2.) bloom_approx_scale_x must be a global float equal to the
+    //                  absolute horizontal scale of BLOOM_APPROX.
+    //              3.) bloom_approx_scale_x/min_allowed_viewport_triads.x
+    //                  should be <= 1.1658025090 to keep the final result <
+    //                  0.62666015625 (the largest sigma ensuring the largest
+    //                  unused texel weight stays < 1.0/256.0 for a 3x3 blur).
+    //              4.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    Return the minimum Gaussian sigma that will blur the pass
+    //              output as much as it would have taken to blur away
+    //              bloom_approx_scale_x horizontal phosphor triads.
+    //  Description:
+    //  BLOOM_APPROX should look like a downscaled phosphor blur.  Ideally, we'd
+    //  use the same blur sigma as the actual phosphor bloom and scale it down
+    //  to the current resolution with (bloom_approx_scale_x/viewport_size_x), but
+    //  we don't know the viewport size in this pass.  Instead, we'll blur as
+    //  much as it would take to blur away min_allowed_viewport_triads.x.  This
+    //  will blur "more than necessary" if the user actually uses more triads,
+    //  but that's not terrible either, because blurring a constant fraction of
+    //  the viewport may better resemble a true optical bloom anyway (since the
+    //  viewport will generally be about the same fraction of each player's
+    //  field of view, regardless of screen size and resolution).
+    //  Assume an extremely large viewport size for asymptotic results (max_viewport_size_x is not declared in this function, so it must be visible at global scope).
+    return bloom_approx_scale_x/max_viewport_size_x *
+        get_min_sigma_to_blur_triad(
+            max_viewport_size_x/min_allowed_viewport_triads.x, thresh);
+}
+
+inline float get_center_weight(const float sigma)
+{
+    //  Given a Gaussian blur sigma, get the blur weight for the center texel.
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        return get_fast_gaussian_weight_sum_inv(sigma);
+    #else
+        const float denom_inv = 0.5/(sigma*sigma);
+        const float w0 = 1.0;
+        const float w1 = exp(-1.0 * denom_inv);
+        const float w2 = exp(-4.0 * denom_inv);
+        const float w3 = exp(-9.0 * denom_inv);
+        const float w4 = exp(-16.0 * denom_inv);
+        const float w5 = exp(-25.0 * denom_inv);
+        const float w6 = exp(-36.0 * denom_inv);
+        const float w7 = exp(-49.0 * denom_inv);
+        const float w8 = exp(-64.0 * denom_inv);
+        const float w9 = exp(-81.0 * denom_inv);
+        const float w10 = exp(-100.0 * denom_inv);
+        const float w11 = exp(-121.0 * denom_inv);
+        const float w12 = exp(-144.0 * denom_inv);
+        const float w13 = exp(-169.0 * denom_inv);
+        const float w14 = exp(-196.0 * denom_inv);
+        const float w15 = exp(-225.0 * denom_inv);
+        const float w16 = exp(-256.0 * denom_inv);
+        const float w17 = exp(-289.0 * denom_inv);
+        const float w18 = exp(-324.0 * denom_inv);
+        const float w19 = exp(-361.0 * denom_inv);
+        const float w20 = exp(-400.0 * denom_inv);
+        const float w21 = exp(-441.0 * denom_inv);
+        //  Note: If the implementation uses a smaller blur than the max allowed,
+        //  the worst case scenario is that the center weight will be overestimated,
+        //  so we'll put a bit more energy into the brightpass...no huge deal.
+        //  Then again, if the implementation uses a larger blur than the max
+        //  "allowed" because of dynamic branching, the center weight could be
+        //  underestimated, which is more of a problem...consider always using
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            //  43x blur:
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 +
+                w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            //  31x blur:
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 +
+                w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            //  25x blur:
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            //  17x blur:
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+        #else
+            //  9x blur:
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+        const float center_weight = weight_sum_inv * weight_sum_inv;
+        return center_weight;
+    #endif
+}
+
+inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  If sigma is static, we can safely branch and use the smallest blur
+    //  that's big enough.  Ignore #define hints, because we'll only use a
+    //  large blur if we actually need it, and the branches cost nothing.
+    #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+    #else
+        //  It's still worth branching if the profile supports dynamic branches:
+        //  It's much faster than using a hugely excessive blur, but each branch
+        //  eats ~1% FPS.
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        #endif
+    #endif
+    //  Failed optimization notes:
+    //  I originally created a same-size mipmapped 5-tap separable blur10 that
+    //  could handle any sigma by reaching into lower mip levels.  It was
+    //  as fast as blur25fast for runtime sigmas and a tad faster than
+    //  blur31fast for static sigmas, but mipmapping two viewport-size passes
+    //  ate 10% of FPS across all codepaths, so it wasn't worth it.
+    #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        if(sigma <= blur9_std_dev)
+        {
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur17_std_dev)
+        {
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur25_std_dev)
+        {
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur31_std_dev)
+        {
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        }
+        else
+        {
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        }
+    #else
+        //  If we can't afford to branch, we can only guess at what blur
+        //  size we need.  Therefore, use the largest blur allowed.
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        #else
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    #endif  //  PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+}
+
+inline float get_bloom_approx_sigma(const float output_size_x_runtime,
+    const float estimated_viewport_size_x)
+{
+    //  Requires:   1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
+    //                  This is included for dynamic codepaths just in case the
+    //                  following two globals are incorrect:
+    //              2.) bloom_approx_size_x_for_fake should == the same
+    //                  if PHOSPHOR_BLOOM_FAKE is #defined
+    //              3.) bloom_approx_size_x should == the same otherwise
+    //  Returns:    For gaussian4x4, return a dynamic small bloom sigma that's
+    //              as close to optimal as possible given available information.
+    //              For blur3x3, return a static small bloom sigma that
+    //              works well for typical cases.  Otherwise, we're using simple
+    //              bilinear filtering, so use static calculations.
+    //  Assume the default static value.  This is a compromise that ensures
+    //  typical triads are blurred, even if unusually large ones aren't.
+    static const float mask_num_triads_static =
+        max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
+    const float mask_num_triads_from_size =
+        estimated_viewport_size_x/mask_triad_size_desired;
+    const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x,
+        lerp(mask_num_triads_from_size, mask_num_triads_desired,
+            mask_specify_num_triads));
+    //  Assume an extremely large viewport size for asymptotic results:
+    static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
+    {
+        //  Use the runtime num triads and output size:
+        const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_runtime;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_runtime/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  account for the Gaussian scanline sigma from the last pass too.
+        //  The bloom will be too wide horizontally but tall enough vertically.
+        return length(float2(bloom_approx_sigma, beam_max_sigma));
+    }
+    else    //  3x3 blur resize (the bilinear resize doesn't need a sigma)
+    {
+        //  We're either using blur3x3 or bilinear filtering.  The biggest
+        //  reason to choose blur3x3 is to avoid dynamic weights, so use a
+        //  static calculation.
+        #ifdef PHOSPHOR_BLOOM_FAKE
+            static const float output_size_x_static =
+                bloom_approx_size_x_for_fake;
+        #else
+            static const float output_size_x_static = bloom_approx_size_x;
+        #endif
+        static const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_static;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_static/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  try accounting for the Gaussian scanline sigma from the last pass
+        //  too; use the static default value:
+        return length(float2(bloom_approx_sigma, beam_max_sigma_static));
+    }
+}
+
+inline float get_final_bloom_sigma(const float bloom_sigma_runtime)
+{
+    //  Requires:   1.) bloom_sigma_runtime is a precalculated sigma that's
+    //                  optimal for the [known] triad size.
+    //              2.) Call this from a fragment shader (not a vertex shader),
+    //                  or blurring with static sigmas won't be constant-folded.
+    //  Returns:    Return the optimistic static sigma if the triad size is
+    //              known at compile time.  Otherwise return the optimal runtime
+    //              sigma (10% slower) or an implementation-specific compromise
+    //              between an optimistic or pessimistic static sigma.
+    //  Notes:      Call this from the fragment shader, NOT the vertex shader,
+    //              so static sigmas can be constant-folded!
+    const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad(
+        mask_triad_size_desired_static, bloom_diff_thresh);
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        return bloom_sigma_runtime;
+    #else
+        //  Overblurring looks as bad as underblurring, so assume average-size
+        //  triads, not worst-case huge triads:
+        return bloom_sigma_optimistic;
+    #endif
+}
+
+
+#endif  //  BLOOM_FUNCTIONS_H
+
+////////////////////////////  END BLOOM-FUNCTIONS  ///////////////////////////
+
+///////////////////////////  END FRAGMENT-INCLUDES  //////////////////////////
+
+void main() {
+    //  Blur the brightpass vertically with a 9/17/25/31/43x blur:
+    const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime);
+    const float3 color = tex2DblurNfast(input_texture, tex_uv,
+        bloom_dxdy, bloom_sigma);
+    //  Encode and output the blurred image:
+    FragColor = encode_output(float4(color, 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/bloom-vertical.vs b/shaders/CRT-Royale.shader/bloom-vertical.vs
new file mode 100644
index 00000000..dfec96e6
--- /dev/null
+++ b/shaders/CRT-Royale.shader/bloom-vertical.vs
@@ -0,0 +1,3792 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+in vec4 position;
+in vec2 texCoord;
+
+out Vertex {
+   vec2 vTexCoord;
+   vec2 tex_uv;
+   vec2 bloom_dxdy;
+   float bloom_sigma_runtime;
+};
+
+uniform vec4 targetSize;
+uniform vec4 sourceSize[];
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms
+#define mul(a,b) (b*a)
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)
+#define rsqrt(c) inversesqrt(c)
+
+#define MASKED_SCANLINEStexture source[0]
+#define MASKED_SCANLINEStexture_size sourceSize[0].xy
+#define MASKED_SCANLINESvideo_size sourceSize[0].xy
+#define BLOOM_APPROXtexture source[3]
+#define BLOOM_APPROXtexture_size sourceSize[3].xy
+#define BLOOM_APPROXvideo_size sourceSize[3].xy
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; set to 0.5 here.  NOTE(review): an earlier note claimed 0.5 was too dark and 1.0 was used instead -- confirm the intended value
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; set to 0.5 here.  NOTE(review): an earlier note claimed 0.5 was too dark and 1.0 was used instead -- confirm the intended value
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifdef RADEON_FIX
+    static const float bloom_approx_filter_static = 1.0;    //  3x3 resize blur
+#else
+    static const float bloom_approx_filter_static = 2.0;    //  true 4x4 Gaussian resize
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifndef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille15_avg_color;  //  default: 15-pixel stripes
+#else
+    static const float mask_grille_avg_color = mask_grille14_avg_color;  //  14-pixel stripes
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero (a macro overloads for float, float2, etc.); yields max(|c|, 2^-16), so the sign of c is dropped:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Always #define SIMULATE_CRT_ON_LCD: The first pass decodes CRT gamma and the last encodes LCD gamma.
+#if !defined(SIMULATE_CRT_ON_LCD)
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifndef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#else
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Filter 2 (4x4 Gaussian resize) won't work here, and filter 1 (3x3 blur)
+    //  is inferior in most cases, so fall back from 2.0 to 0.0 (bilinear):
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#endif
+
+//  Without RUNTIME_SHADER_PARAMS_ENABLE only static parameters are used, so
+//  disable the slow runtime paths (most are harmless once off; some aren't).
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for each requested tex2Dlod strategy (wider compatibility):
+#if defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD)
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+#if defined(ANISOTROPIC_TILING_COMPAT_TEX2DLOD)
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out anisotropic compatibility strategies the drivers can't support:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance (lod,
+//  bias, flat twice, then fix) and disable the rest, all in one nested place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  tex2Dlod strategy not #defined
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  tex2Dbias strategy not #defined either
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them
+//  into one flag.  (The prioritization above already #undef's tex2Dbias when
+//  tex2Dlod is available, so at most one of the two is still #defined here;
+//  either one sets the shared flag.)
+#if defined(ANISOTROPIC_TILING_COMPAT_TEX2DLOD) || defined(ANISOTROPIC_TILING_COMPAT_TEX2DBIAS)
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+//  tex2Dlod wins, so drop the tex2Dbias fallback whenever tex2Dlod is still
+//  #defined at this point.
+#if defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD) && defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS)
+    #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.  Pick the source
+//  LUT size for the mask resize accordingly:
+//  - Large LUT: Needs both DRIVERS_ALLOW_TEX2DLOD and
+//    ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD.
+//  - Small LUT: Every other case.
+#if defined(DRIVERS_ALLOW_TEX2DLOD) && defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD)
+    //  TODO: Take advantage of this!
+    #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+//  Enable runtime mask mode/type selection whenever dynamic branches are
+//  allowed; without them, enable it only if the user explicitly #defined
+//  FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT to accept the slower
+//  every-branch path.
+#if defined(DRIVERS_ALLOW_DYNAMIC_BRANCHES) || defined(FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT)
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        //  The red offset may be negative (blue mirrors it), so pad by its magnitude:
+        static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x);
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border (tex2Dlod family excepted):
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#endif
+//  The discontinuity fix needs 1.0 more pixel of border when it's enabled:
+#ifndef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#endif
+//  Round the pixel border up to a whole number of texels.  With GEOMETRY_EARLY,
+//  assume same-pass curvature about triples the texel frequency first:
+#ifndef GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#else
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it:
+//  a.) "Tile flat twice" without GEOMETRY_EARLY: Render two tiles without
+//      borders and start at texel 0.  Anisotropic filtering doesn't seem to
+//      be a problem here.
+//  b.) Otherwise: Render one tile plus a worst-case border on each side, and
+//      start sampling just inside that border.
+//  "Tile flat twice" is documented above as incompatible with same-pass
+//  curvature, presumably why it's ignored when GEOMETRY_EARLY is #defined.
+#if !defined(GEOMETRY_EARLY) && defined(ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE)
+    static const float mask_resize_num_tiles = 1.0 + 1.0;
+    static const float mask_start_texels = 0.0;
+#else
+    static const float mask_resize_num_tiles = 1.0 +
+        2.0 * max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an older note mentioned using 1.0 because 0.5 looked too dark, but 0.5 is what is set here)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  option 2.) above: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  option 1.) above: 3x3 resize blur
+#endif  //  !RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; currently the 0.5 default (raise toward 1.0 if too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  15-pixel stripes (default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; result is always positive (abs)
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0 (bilinear resize):
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  Not in compatibility mode: use the static filter choice unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  No tex2Dlod: tex2Dbias (if still defined) outranks the tile-based strategies.
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  Neither tex2Dlod nor tex2Dbias: "tile flat twice" outranks "fix discontinuities".
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  tex2Dlod resampling not enabled: resize from the small LUT.
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  No tex2Dlod support: resize from the small LUT.
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  No dynamic branches: select at runtime only if the user forces it.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Build a unit-length (aspect, 1.0) vector for curvature uv-mapping.
+    //  The requested ratio is capped at geom_max_aspect_ratio first, and
+    //  normalizing keeps only the proportions so that the absolute scale of
+    //  the ratio cannot leak into the mapping:
+    const float capped_ratio = min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const float2 unnormalized_aspect = float2(capped_ratio, 1.0);
+    return normalize(unnormalized_aspect);
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Packs the scalar settings geom_overscan_x/_y into one (x, y) float2.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Packs the scalar settings geom_tilt_angle_x/_y into one (x, y) float2.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  Packs convergence_offset_x_{r,g,b} into one float3, in (r, g, b) order.
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  Packs convergence_offset_y_{r,g,b} into one float3, in (r, g, b) order.
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Packs convergence_offset_{x,y}_r into one float2, in (x, y) order.
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Packs convergence_offset_{x,y}_g into one float2, in (x, y) order.
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Packs convergence_offset_{x,y}_b into one float2, in (x, y) order.
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{   //  Static offset unless both RUNTIME_ANTIALIAS_* macros are defined.
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        return aa_subpixel_r_offset_static;
+    #else
+        #ifndef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            return aa_subpixel_r_offset_static;
+        #else
+            //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #endif
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Return the reciprocal of the average color of whichever mask the
+    //  mask_type thresholds select (< 0.5: grille, < 1.5: slot, otherwise:
+    //  shadow):
+    if(mask_type < 0.5)         {   return 1.0/mask_grille_avg_color;   }
+    else if(mask_type < 1.5)    {   return 1.0/mask_slot_avg_color;     }
+    else                        {   return 1.0/mask_shadow_avg_color;   }
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Start from the runtime-requested mode when runtime selection is
+    //  compiled in, otherwise from the static setting.  Without
+    //  PHOSPHOR_MASK_MANUALLY_RESIZE, restrict the result to [1.0, 2.0]:
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        const float requested_mode = mask_sample_mode_desired;
+    #else
+        const float requested_mode = mask_sample_mode_static;
+    #endif
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (lowest precedence)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode a pass's output color.  Passes that do not encode
+    //  (gamma_encode_output == false) hand the color back untouched.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+
+    //  Raise rgb to 1.0/gamma using the current pass's output gamma:
+    const float3 encoded_rgb =
+        pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+
+    //  Alpha is never gamma-encoded; it is forced to 1.0 only when the
+    //  assume_opaque_alpha policy is enabled:
+    const float encoded_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, encoded_alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize a sampled input color.  Passes that do not linearize
+    //  (linearize_input == false) get the sample back unchanged.
+    if(!linearize_input)
+    {
+        return color;
+    }
+
+    //  Raise rgb to the current pass's input gamma:
+    const float3 linear_rgb =
+        pow(color.rgb, float3(get_pass_input_gamma()));
+
+    //  Alpha passes through as-is unless assume_opaque_alpha forces it
+    //  to 1.0:
+    const float linear_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, linear_alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Linearize a sample with a caller-supplied per-channel gamma.  Unlike
+    //  decode_input(), this does not consult linearize_input: the power
+    //  curve is always applied.  Alpha is passed through, or forced to 1.0
+    //  when assume_opaque_alpha is set:
+    const float3 linear_rgb = pow(color.rgb, gamma);
+
+    const float linear_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, linear_alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample, then decode_input().  Coords are left non-const on purpose (see the TODO/FIXME note above).  NOTE(review): the texel_off overloads pass texel_off in textureLod's third (LOD) position rather than applying a texel offset, assuming textureLod is the GLSL built-in; confirm this is intended.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: sample tex_coords.xy via textureLod, then decode_input().  NOTE(review): tex_coords.zw are unused; the first overload hard-codes LOD 0.0 and the second passes texel_off in the LOD position.  Confirm against callers that expect the LOD to come from tex_coords.w.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
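+/////////*  (Not a comment terminator: the block comment opened above "tex3D:" continues down to the terminator that follows the tex2Dfetch_linearize_gamma wrappers.)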
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)    //  Samples mip level 0 at tex_coords.xy (zw are unused) and passes the texel and gamma to decode_gamma_input.
+{   const float2 uv = tex_coords.xy;  return decode_gamma_input(textureLod(tex, uv, 0.0), gamma);    }
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)    //  texel_off is a whole-texel offset applied to both axes (it is NOT a mip LOD); mip level 0 is sampled and passed with gamma to decode_gamma_input.
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy + float2(texel_off) / float2(textureSize(tex, 0)), 0.0), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Pick the looping strategy the resampling functions below will use:
+//  - Dynamic branches not allowed, loops not accommodated: one flat static loop.
+//  - Dynamic branches not allowed, loops accommodated: coarse branches around
+//    static loop pieces.  (The macro name keeps its original spelling.)
+#if !defined(DRIVERS_ALLOW_DYNAMIC_BRANCHES)
+    #if defined(ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS)
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  Otherwise neither macro is defined: a flat dynamic loop is used.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  Larger resized tiles need fewer samples when downsizing, so derive a
+//  static lower bound on the tile size from the minimum allowed triad size:
+static const float mask_min_allowed_tile_size =
+    ceil(mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size =
+    mask_min_allowed_tile_size;
+//  The maximum minification factor bounds the number of sinc resize taps:
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float =
+    2.0 * mask_sinc_lobes * mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+//  Vectorized loops take samples four at a time, so round up to a multiple of 4:
+static const float max_sinc_resize_samples_m4 =
+    ceil(max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   These global constants must be defined:
+    //              1.) mask_sinc_lobes
+    //              2.) max_sinc_resize_samples_m4
+    //  Returns:    The minimum number of texture samples for a correct
+    //              downsize at magnification_scale, capped at the maximum.
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float sample_cap = max_sinc_resize_samples_m4;
+    #else   // ifdef BREAK_LOOPS_INTO_PIECES
+        //  Simulating loops with branches imposes a 128-sample limit.
+        const float sample_cap = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    //  We're downsizing, so the filter spans 2*lobes output pixels (not
+    //  2*lobes input texels); that sets how many input samples are needed.
+    const float needed = 2.0 * mask_sinc_lobes / magnification_scale;
+    //  Round up to a multiple of 4 before applying the cap:
+    const float needed_m4 = ceil(needed * 0.25) * 4.0;
+    return min(needed_m4, sample_cap);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
+    const float2 tex_size, const float dr,
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Purpose:    Locate the first (topmost or leftmost) texel of the
+    //              sample window and measure how far it is from the output.
+    //  Requires:   1.) dr is the uv step of one texel in the direction being
+    //                  resampled: 1.0/texture_size.x or 1.0/texture_size.y.
+    //                  It's a scalar to save register space.
+    //              2.) input_tiles_per_texture_r is the number of input tiles
+    //                  that fit in the input texture in that direction.
+    //              3.) vertical is true for a vertical resample pass and
+    //                  false for a horizontal one.
+    //  Returns:    x = the first sample's wrapped tile_uv coord and
+    //              y = its texel distance from the destination pixel, both
+    //              in the resized dimension only.
+    //  Both dimensions are computed as if this were a one-pass 2D resize;
+    //  the unneeded (and incorrect) one is thrown away at the end.
+    //  Output position in texels:
+    const float2 dest_texel = tex_uv * tex_size;
+    //  Texel center found by flooring (under_half is defined elsewhere):
+    const float2 texel_before =
+        floor(dest_texel - float2(under_half)) + float2(0.5);
+    //  Back up from there to the start of the window of samples:
+    const float2 window_start = texel_before - float2(samples/2.0 - 1.0);
+    const float2 window_start_dist = dest_texel - window_start;
+    //  Go from texels to tex_uv, then to tile_uv coords so we can sub fracs
+    //  for fmods:
+    const float2 tile_uv_unwrapped =
+        (window_start * dr) * input_tiles_per_texture_r;
+    //  Wrap the coordinate with frac.  All samples get wrapped later, but
+    //  the first texel is special since it might be negative: each negative
+    //  component additionally has 1.0 added.
+    const float2 negative_flags = float2(
+        (tile_uv_unwrapped.x < 0.), (tile_uv_unwrapped.y < 0.));
+    const float2 tile_uv_wrapped = frac(tile_uv_unwrapped) + negative_flags;
+    //  Pack the coord and distance of the dimension being resized:
+    if(vertical)
+    {
+        return float2(tile_uv_wrapped.y, window_start_dist.y);
+    }
+    return float2(tile_uv_wrapped.x, window_start_dist.x);
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Sample tex at tex_uv.  Mipmapping and anisotropic filtering get confused
+    //  by sinc-resampling; one [slow] workaround is to force or bias to mip 0:
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);    //  explicit LOD 0
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return texture(tex, tex_uv, -16.0);     //  LOD bias of -16
+        #else
+            return texture(tex, tex_uv);    //  default: ordinary lookup
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord.  tex_uv_r will contain tile_uv_r scaled by tile_size_uv_r
+    //  for four new samples.  i_base, i, and tile_dr must also be in scope.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the uv size of one input tile within
+    //                  the input texture, in the direction we're resampling
+    //                  this pass (its reciprocal is the tiles per texture).
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum the weights and weighted sample colors; the looping strategy
+    //  depends on our expected dynamic loop capabilities.  The loop body
+    //  macros above read this function's locals by name: don't rename them.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  64+32+16+8+4 = 124 so far; one more 4-sample block allows 128 max.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum the weights and weighted sample colors; the looping strategy
+    //  depends on our expected dynamic loop capabilities.  The loop body
+    //  macros above read this function's locals by name: don't rename them.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  64+32+16+8+4 = 124 so far; one more 4-sample block allows 128 max.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Purpose:    Work out the final size of a manually resized mask tile,
+    //              constraining the desired size to avoid artifacts.
+    //  Requires:   These global constants, defined elsewhere, must satisfy:
+    //              1.) mask_resize_num_triads: High enough that the mask
+    //                  sampling method won't show artifacts later (long
+    //                  story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of the mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in the LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired
+    //              7.) mask_num_triads_desired
+    //              8.) mask_specify_num_triads: Must be 0.0/1.0 (false/true)
+    //  Parameters: 1.) estimated_viewport_size: The final viewport size.
+    //                  If mask_specify_num_triads is 1.0/true and this
+    //                  estimate is wrong, the number of triads will differ
+    //                  from the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.  Exception: The
+    //                  x component may be estimated garbage if and only if
+    //                  the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Pass false
+    //                  unless every call across every pass is guaranteed to
+    //                  use the same sizes for the other parameters.
+    //              Across multiple passes, always use the same y viewport
+    //              size/scale, and the same x viewport size/scale whenever
+    //              the x result is used.
+    //  Returns:    The constrained tile size.  Under unusual circumstances,
+    //              tiles may become stretched vertically (see wall of text
+    //              below).
+    //  Shape of the LUT tile (the stated tile properties must be correct):
+    static const float inv_aspect_ratio =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float aspect_ratio = 1.0/inv_aspect_ratio;
+    static const float2 aspect_scale = float2(1.0, inv_aspect_ratio);
+    //  The user specifies either a triad size or a triad count.  A count is
+    //  converted using estimated_viewport_size.x, so a wrong estimate there
+    //  misinterprets the preference:
+    const float triad_size_from_count =
+        estimated_viewport_size.x / mask_num_triads_desired;
+    const float wanted_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired, triad_size_from_count,
+        mask_specify_num_triads);
+    //  Constraints are only needed when MASK_RESIZE is what gets sampled:
+    if(get_mask_sample_mode() > 0.5)
+    {
+        return wanted_tile_size_x * aspect_scale;
+    }
+    //  Make sure we're not upsizing past the LUT width:
+    const float capped_tile_size_x =
+        min(wanted_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce the minimum and maximum tile size in both dimensions:
+    static const float2 smallest_tile =
+        mask_min_allowed_tile_size * aspect_scale;
+    const float2 largest_tile =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 bounded_tile = clamp(
+        capped_tile_size_x * aspect_scale, smallest_tile, largest_tile);
+    //  Try to maintain the aspect ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so are largest_tile.x and bounded_tile.x.)
+    //  We can't adjust the y size based on bounded_tile.x.  If it clamps
+    //  when it shouldn't, it won't clamp again when later passes call this
+    //  function with the correct sizes, and the discrepancy will break the
+    //  sampling coords in MASKED_SCANLINES.  Instead, limit the x size
+    //  based on the y size, but not vice versa, unless the caller swears
+    //  the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, the stretching can be fixed by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_limit_from_y = bounded_tile.y * aspect_ratio;
+    const float y_limit_from_x = lerp(bounded_tile.y,
+        bounded_tile.x * inv_aspect_ratio,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 aspect_limited_tile = float2(
+        min(bounded_tile.x, x_limit_from_y),
+        min(bounded_tile.y, y_limit_from_x));
+    //  Tiled sampling only works with integer tile sizes in both
+    //  directions.  Use floor (so we never round up), but add
+    //  FIX_ZERO(0.0) first to avoid a rounding bug where floor decreases
+    //  whole numbers:
+    const float2 final_resized_tile_size =
+        floor(aspect_limited_tile + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Purpose:    Find where the phosphor mask tile sits in the texture that
+    //              will be sampled, and how many tiles cover the viewport.
+    //  Requires:   1.) Everything get_resized_mask_tile_size() requires,
+    //                  global constants included, must hold.
+    //              2.) While get_mask_sample_mode() is 0:
+    //                  mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  mask_resize_video_size == MASK_RESIZE.video_size
+    //                  (In other modes these two may hold anything.)
+    //              3.) true_viewport_size == IN.output_size of a pass whose
+    //                  viewport scale is 1.0 (i.e. the correct viewport size)
+    //  Returns:    A float4 holding:
+    //                  xy: tex_uv coords where the mask tile starts
+    //                  zw: tex_uv size of the mask tile, start to end
+    //  Outputs:    mask_tiles_per_screen is set to the number of mask tiles
+    //              that fit on the screen.
+    //              Every return path assigns it.
+    //  Start from the final resized tile size.  That needs a correct viewport
+    //  size and mask resize viewport scale, but we pass false rather than
+    //  solemnly swearing they were correct in both mask resize passes: the
+    //  oath permits a better tile aspect ratio, but lying would give
+    //  inconsistent sizes across passes and broken texture coordinates.
+    const float sample_mode = get_mask_sample_mode();
+    const float2 resized_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    //  The sample mode decides which texture the mask is read from:
+    if(!(sample_mode < 0.5))
+    {
+        //  Hardware tiling: tex_uv spans the whole texture, so the "tile"
+        //  starts at (0, 0) with size (1, 1).  When tiling at the original
+        //  size (1:1 pixel:texel), that "tile" is the full texture holding
+        //  many triads; otherwise we're hardware-resampling an LUT that
+        //  truly contains a single unresized phosphor mask tile.
+        if(sample_mode > 1.5)
+        {
+            //  Repeat the full LUT at a 1:1 pixel:texel ratio, unresized:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else
+        {
+            //  Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / resized_tile_size;
+        }
+        return float4(float2(0.0), float2(1.0));
+    }
+    //  Otherwise sample MASK_RESIZE.  mask_tiles_per_screen must be based on
+    //  the *true* viewport size:
+    mask_tiles_per_screen = true_viewport_size / resized_tile_size;
+    //  The resized tile is only a fraction of the texture size...
+    const float2 tile_uv_extent = resized_tile_size /
+        mask_resize_texture_size;
+    //  ...and it starts at a nonzero offset to allow for border texels.
+    //  Express mask_start_texels in tiles, then scale that to tex_uv:
+    const float2 tiles_before_start = mask_start_texels/resized_tile_size;
+    const float2 tile_uv_origin = tiles_before_start * tile_uv_extent;
+    return float4(tile_uv_origin, tile_uv_extent);
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Purpose:    Map repeating tile-relative coords to the tex_uv coords
+    //              used to sample the phosphor mask.
+    //  Requires:   1.) tile_uv_wrap holds tile-relative uv coords: one tile
+    //                  spans [0, 1], so (0.5, 0.5) is the tile center.  Inputs
+    //                  may range over [0, inf]; their fractional parts map to
+    //                  a repeated tile.  ("Tile" can mean texture, the video
+    //                  embedded in the texture, or some other "tile" embedded
+    //                  in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy holds the tex_uv coords
+    //                  where the embedded tile starts in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw holds the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    tex_uv coords (for texture sampling) matching tile_uv_wrap.
+    //  The two paths are selected by get_mask_sample_mode(): values below
+    //  0.5 tile the resized mask manually; anything else relies on hardware
+    //  tiling of the input phosphor mask texture.
+    if(!(get_mask_sample_mode() < 0.5))
+    {
+        //  Hardware tiling samples the input phosphor mask texture directly.
+        //  In mode 2 (tiling at the original size) the "tile" is the whole
+        //  texture, holding many triads at a 1:1 pixel:texel ratio;
+        //  OTHERWISE the texture holds a single unresized tile.  Either way
+        //  tile_uv_wrap already has the correct coords!
+        return tile_uv_wrap;
+    }
+    //  Manual tiling: repeat the resized mask tile to fill the screen.
+    //  Reduce to fractional tile coords first.  frac/fmod on coords confuses
+    //  anisotropic filtering, so apply whichever fix the user options select
+    //  (derived-settings-and-constants.h disables incompatible options).
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        float2 wrapped_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+    #else
+        float2 wrapped_uv = frac(tile_uv_wrap);
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        //  NOTE: fix_tiling_discontinuities_normalized() is commented out
+        //  above this function, so enabling this needs that helper restored.
+        wrapped_uv = fix_tiling_discontinuities_normalized(wrapped_uv,
+            ddx(wrapped_uv), ddy(wrapped_uv));
+    #endif
+    //  The tile is embedded in a padded FBO, and it may start at a nonzero
+    //  offset if border texels are used to avoid artifacts:
+    return mask_tile_start_uv_and_size.xy +
+        wrapped_uv * mask_tile_start_uv_and_size.zw;
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+/////////////////////////////  END VERTEX-INCLUDES  ////////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+float bloom_approx_scale_x = targetSize.y / sourceSize[0].y;   //  NOTE(review): named _x but built from the .y sizes -- confirm
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+const float bloom_diff_thresh_ = 1.0/256.0;    //  thresh handed to get_min_sigma_to_blur_triad() in main()
+
+// copied from bloom-functions.h
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Purpose:    Give the minimum sigma that fully blurs an on-screen
+    //              phosphor triad to an even color, within thresh.
+    //  Requires:   1.) triad_size is the final phosphor triad size in pixels
+    //              2.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Curve-fit estimate: max error = ~0.086036, mean sq. error = ~0.0013387.
+    //  (Looser alternative, max error = ~0.16486, mean sq. error = ~0.0041041:
+    //  0.5985*triad_size - triad_size*sqrt(thresh))
+    const float size_term = -0.05168 + 0.6113*triad_size;
+    const float thresh_term = 1.122*triad_size*sqrt(0.000416 + thresh);
+    return size_term - thresh_term;
+}
+
+void main() {   //  Vertex setup for the vertical-only bloom blur
+   gl_Position = position;
+   vTexCoord = texCoord;
+	tex_uv = vTexCoord.xy * 1.0001;
+   //  NOTE(review): why tex_uv is scaled by 1.0001 is not documented here.
+	//  Get the uv sample distance between output pixels.  Calculate dxdy like
+    //  blurs/vertex-shader-blur-fast-vertical.h.
+    const float2 dxdy_scale = video_size/output_size;
+    const float2 dxdy = dxdy_scale/texture_size;
+    //  This blur is vertical-only, so zero out the horizontal offset:
+    bloom_dxdy = float2(0.0, dxdy.y);
+
+    //  Calculate a runtime bloom_sigma in case it's needed:
+    const float mask_tile_size_x = get_resized_mask_tile_size(
+        output_size, output_size * mask_resize_viewport_scale, false).x;
+    bloom_sigma_runtime = get_min_sigma_to_blur_triad(
+        mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_);
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/blur9fast-horizontal.fs b/shaders/CRT-Royale.shader/blur9fast-horizontal.fs
new file mode 100644
index 00000000..c7293eed
--- /dev/null
+++ b/shaders/CRT-Royale.shader/blur9fast-horizontal.fs
@@ -0,0 +1,2016 @@
+#version 150
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+#if __VERSION__ >= 130
+#define COMPAT_TEXTURE texture
+#else
+#define COMPAT_TEXTURE texture2D
+#endif
+
+#ifdef GL_ES
+#define COMPAT_PRECISION mediump
+#else
+#define COMPAT_PRECISION
+#endif
+
+uniform sampler2D source[];
+uniform vec4 sourceSize[];
+uniform vec4 targetSize;
+
+in Vertex {
+   vec2 vTexCoord;
+   vec2 blur_dxdy;
+};
+
+out vec4 FragColor;
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  PASS SETTINGS:
+//  gamma-management.h needs to know what kind of pipeline we're using and
+//  what pass this is in that pipeline.  This will become obsolete if/when we
+//  can #define things like this in the .cgp preset file.
+//#define GAMMA_ENCODE_EVERY_FBO
+//#define FIRST_PASS
+//#define LAST_PASS
+//#define SIMULATE_CRT_ON_LCD
+//#define SIMULATE_GBA_ON_LCD
+//#define SIMULATE_LCD_ON_CRT
+//#define SIMULATE_GBA_ON_CRT
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    float lcd_reference_gamma = 2.5;       //  To match CRT
+    float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define crt_gamma, gba_gamma, lcd_gamma:
+    float get_crt_gamma()    {   return crt_gamma;   }
+    float get_gba_gamma()    {   return gba_gamma;   }
+    float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  No override: use the standard gamma constants
+    float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  User promises global intermediate_gamma, input_gamma, output_gamma:
+    float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    float get_input_gamma()          {   return input_gamma;         }
+    float get_output_gamma()         {   return output_gamma;        }
+#else   //  No override: pick gammas from the SIMULATE_* pass settings
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        float get_input_gamma()      {   return get_crt_gamma();     }
+        float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  !SIMULATE_CRT_ON_LCD
+    #ifdef SIMULATE_GBA_ON_LCD
+        float get_input_gamma()      {   return get_gba_gamma();     }
+        float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  !SIMULATE_GBA_ON_LCD
+    #ifdef SIMULATE_LCD_ON_CRT
+        float get_input_gamma()      {   return get_lcd_gamma();     }
+        float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  !SIMULATE_LCD_ON_CRT
+    #ifdef SIMULATE_GBA_ON_CRT
+        float get_input_gamma()      {   return get_gba_gamma();     }
+        float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        float get_input_gamma()      {   return ntsc_gamma;          }
+        float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+#ifndef GAMMA_ENCODE_EVERY_FBO  //  Flags are set only for first/last passes:
+    #ifdef FIRST_PASS
+        bool linearize_input = true;
+        float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  !FIRST_PASS: flag off, input gamma 1.0
+        bool linearize_input = false;
+        float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        bool gamma_encode_output = true;
+        float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  !LAST_PASS: flag off, output gamma 1.0
+        bool gamma_encode_output = false;
+        float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif  //  LAST_PASS
+#else   //  GAMMA_ENCODE_EVERY_FBO: both flags are set in every pass
+    bool linearize_input = true;
+    bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  !FIRST_PASS: decode with the intermediate gamma
+        float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  !LAST_PASS: encode with the intermediate gamma
+        float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Decode a sampled color using this pass's input gamma.
+//  color:   the color to decode (typically a texture sample).
+//  Returns: color unchanged when linearize_input is false; otherwise rgb is
+//           raised to get_pass_input_gamma(), and alpha is forced to 1.0 only
+//           when assume_opaque_alpha is set.
+vec4 decode_input(vec4 color)
+{
+    //  Test the flags directly.  Writing "flag = true" here is an assignment,
+    //  not a comparison: it overwrites the global and is always taken.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    vec3 linear_rgb = pow(color.rgb, vec3(get_pass_input_gamma()));
+    //  Optionally discard the source alpha in favor of full opacity:
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(linear_rgb, alpha);
+}
+
+//  Gamma-encode a color using this pass's output gamma.
+//  color:   the color to encode (typically the pass result).
+//  Returns: color unchanged when gamma_encode_output is false; otherwise rgb
+//           is raised to 1.0/get_pass_output_gamma(), and alpha is forced to
+//           1.0 only when assume_opaque_alpha is set.
+vec4 encode_output(vec4 color)
+{
+    //  Test the flags directly.  Writing "flag = true" here is an assignment,
+    //  not a comparison: it overwrites the global and is always taken.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    vec3 encoded_rgb = pow(color.rgb, vec3(1.0/get_pass_output_gamma()));
+    //  Optionally discard the computed alpha in favor of full opacity:
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(encoded_rgb, alpha);
+}
+
+#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords)
+//{   return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords)));   }
+
+//#define tex2D_linearize(C, D, E) decode_input(vec4(COMPAT_TEXTURE(C, D, E)))
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords, int texel_off)
+//{   return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords, texel_off)));    }
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  IN.output_size < IN.video_size.
+//              4.) IN.output_size == IN.video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (IN.video_size/IN.output_size)/IN.texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(IN.video_size/IN.output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static float blur3_std_dev
+//                      static float blur4_std_dev
+//                      static float blur5_std_dev
+//                      static float blur6_std_dev
+//                      static float blur7_std_dev
+//                      static float blur8_std_dev
+//                      static float blur9_std_dev
+//                      static float blur10_std_dev
+//                      static float blur11_std_dev
+//                      static float blur12_std_dev
+//                      static float blur17_std_dev
+//                      static float blur25_std_dev
+//                      static float blur31_std_dev
+//                      static float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+#ifndef OVERRIDE_BLUR_STD_DEVS
+    //  blurN_std_dev values are expressed in units of dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  Optional set: standard deviations corresponding to a binomial
+        //  distribution with p = 0.5 (related to Pascal's triangle).  With
+        //  this distribution, blurring multiple times should give the same
+        //  result as a single larger blur.  Compared with the defaults in the
+        //  #else branch, these are larger up to blur6 and smaller thereafter.
+        float blur3_std_dev = 0.84931640625;
+        float blur4_std_dev = 0.84931640625;
+        float blur5_std_dev = 1.0595703125;
+        float blur6_std_dev = 1.06591796875;
+        float blur7_std_dev = 1.17041015625;
+        float blur8_std_dev = 1.1720703125;
+        float blur9_std_dev = 1.2259765625;
+        float blur10_std_dev = 1.21982421875;
+        float blur11_std_dev = 1.25361328125;
+        float blur12_std_dev = 1.2423828125;
+        float blur17_std_dev = 1.27783203125;
+        float blur25_std_dev = 1.2810546875;
+        float blur31_std_dev = 1.28125;
+        float blur43_std_dev = 1.28125;
+    #else
+        //  Defaults: for each blur size, the largest standard deviation that
+        //  keeps the largest unused blur term on each side <= 1.0/256.0.  (A
+        //  compromise: one could get away with more or be more conservative.)
+        float blur3_std_dev = 0.62666015625;
+        float blur4_std_dev = 0.66171875;
+        float blur5_std_dev = 0.9845703125;
+        float blur6_std_dev = 1.02626953125;
+        float blur7_std_dev = 1.36103515625;
+        float blur8_std_dev = 1.4080078125;
+        float blur9_std_dev = 1.7533203125;
+        float blur10_std_dev = 1.80478515625;
+        float blur11_std_dev = 2.15986328125;
+        float blur12_std_dev = 2.215234375;
+        float blur17_std_dev = 3.45535583496;
+        float blur25_std_dev = 5.3409576416;
+        float blur31_std_dev = 6.86488037109;
+        float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS (binomial vs. default sigmas)
+#endif  //  OVERRIDE_BLUR_STD_DEVS (define it to supply your own sigmas)
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  error_blurring should be in [0.0, 1.0].  Per the original author, higher
+    //  values reduce ringing in shared-sample blurs but add blurring/feature shift.
+    float error_blurring = 0.5;
+#endif
+
+//  Squared-length helper macro (for static constants).  NOTE(review): defined again, identically, further down; keep both in sync.
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+//#include "quad-pixel-communication.h"
+//#include "special-functions.h"
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (vec4/vec3/vec2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+vec4 erf6(vec4 x)
+{
+    //  Requires:   x is the standard argument of erf().
+    //  Returns:    An Abramowitz/Stegun-style approximation of
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2)),
+    //              with a quoted max absolute error of 2.5*10**-5.  The
+    //              magnitude is computed from abs(x) and x*x.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    vec4 unit = vec4(1.0);
+    vec4 t = unit/(unit + 0.47047*abs(x));
+    vec4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec4 magnitude = unit - poly*exp(-(x*x));
+    //  Restore the sign of the input (erf is an odd function):
+    return magnitude * sign(x);
+}
+
+vec3 erf6(vec3 x)
+{
+    //  vec3 overload of the polynomial erf() approximation:
+    vec3 unit = vec3(1.0);
+    vec3 t = unit/(unit + 0.47047*abs(x));
+    vec3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec3 magnitude = unit - poly*exp(-(x*x));
+    //  Restore the sign of the input (erf is an odd function):
+    return magnitude * sign(x);
+}
+
+vec2 erf6(vec2 x)
+{
+    //  vec2 overload of the polynomial erf() approximation:
+    vec2 unit = vec2(1.0);
+    vec2 t = unit/(unit + 0.47047*abs(x));
+    vec2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec2 magnitude = unit - poly*exp(-(x*x));
+    //  Restore the sign of the input (erf is an odd function):
+    return magnitude * sign(x);
+}
+
+float erf6(float x)
+{
+    //  Float overload of the polynomial erf() approximation:
+    float t = 1.0/(1.0 + 0.47047*abs(x));
+    float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float magnitude = 1.0 - poly*exp(-(x*x));
+    //  Restore the sign of the input (erf is an odd function):
+    return magnitude * sign(x);
+}
+
+vec4 erft(vec4 x)
+{
+    //  Requires:   x is the standard argument of erf().
+    //  Returns:    erf() approximated by a scaled hyperbolic tangent.  The
+    //              original author reports visible error but high speed.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this where the driver implements tanh()
+    //              correctly: the author's nVidia 8800GTS returned garbage.
+    vec4 scaled = 1.202760580 * x;
+    return tanh(scaled);
+}
+
+vec3 erft(vec3 x)
+{
+    vec3 scaled = 1.202760580 * x;  //  vec3 overload of the tanh() fit
+    return tanh(scaled);
+}
+
+vec2 erft(vec2 x)
+{
+    vec2 scaled = 1.202760580 * x;  //  vec2 overload of the tanh() fit
+    return tanh(scaled);
+}
+
+float erft(float x)
+{
+    float scaled = 1.202760580 * x;  //  Float overload of the tanh() fit
+    return tanh(scaled);
+}
+
+vec4 erf(vec4 x)
+{
+    //  Requires:   x is the standard argument of erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec3 erf(vec3 x)
+{
+    //  vec3 overload: erft() if ERF_FAST_APPROXIMATION is defined, else erf6().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec2 erf(vec2 x)
+{
+    //  vec2 overload: erft() if ERF_FAST_APPROXIMATION is defined, else erf6().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+float erf(float x)
+{
+    //  Float overload: erft() if ERF_FAST_APPROXIMATION is defined, else erf6().
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+vec4 gamma_impl(vec4 s, vec4 s_inv)
+{
+    //  Requires:   1.) s is the standard gamma function argument, expected
+    //                  to lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s, precomputed by the caller so that it
+    //                  can be reused elsewhere.
+    //  Returns:    An approximation of gamma(s) (the real-numbered
+    //              factorial) from a two-coefficient Lanczos approximation,
+    //              with coefficients derived by Paul Godfrey's method:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              The g below (~1.12906830989) was tuned for s in [0, 36];
+    //              the quoted maximum relative error is 0.000463 over 2**16
+    //              equally-spaced evaluations.  Three coefficients would cut
+    //              that to 0.0000346 without hurting latency, but two leave
+    //              more parallelism for surrounding instructions.
+    vec4 lanczos_g = vec4(1.12906830989);
+    vec4 coeff0 = vec4(0.8109119309638332633713423362694399653724431);
+    vec4 coeff1 = vec4(0.4808354605142681877121661197951496120000040);
+    vec4 euler = vec4(2.71828182845904523536028747135266249775724709);
+    vec4 s_plus_half = s + vec4(0.5);
+    vec4 series = coeff0 + coeff1/(s + vec4(1.0));
+    vec4 pow_base = (s_plus_half + lanczos_g)/euler;
+    //  pow_base**s_plus_half * series approximates gamma(s + 1); multiplying
+    //  by s_inv turns that into gamma(s).  The original author notes this
+    //  has less error for small s than starting with (s -= 1.0).
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+vec3 gamma_impl(vec3 s, vec3 s_inv)
+{
+    //  vec3 overload of the two-coefficient Lanczos gamma approximation:
+    vec3 lanczos_g = vec3(1.12906830989);
+    vec3 coeff0 = vec3(0.8109119309638332633713423362694399653724431);
+    vec3 coeff1 = vec3(0.4808354605142681877121661197951496120000040);
+    vec3 euler = vec3(2.71828182845904523536028747135266249775724709);
+    vec3 s_plus_half = s + vec3(0.5);
+    vec3 series = coeff0 + coeff1/(s + vec3(1.0));
+    vec3 pow_base = (s_plus_half + lanczos_g)/euler;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+vec2 gamma_impl(vec2 s, vec2 s_inv)
+{
+    //  vec2 overload of the two-coefficient Lanczos gamma approximation:
+    vec2 lanczos_g = vec2(1.12906830989);
+    vec2 coeff0 = vec2(0.8109119309638332633713423362694399653724431);
+    vec2 coeff1 = vec2(0.4808354605142681877121661197951496120000040);
+    vec2 euler = vec2(2.71828182845904523536028747135266249775724709);
+    vec2 s_plus_half = s + vec2(0.5);
+    vec2 series = coeff0 + coeff1/(s + vec2(1.0));
+    vec2 pow_base = (s_plus_half + lanczos_g)/euler;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(float s, float s_inv)
+{
+    //  Float overload of the two-coefficient Lanczos gamma approximation:
+    float lanczos_g = 1.12906830989;
+    float coeff0 = 0.8109119309638332633713423362694399653724431;
+    float coeff1 = 0.4808354605142681877121661197951496120000040;
+    float euler = 2.71828182845904523536028747135266249775724709;
+    float s_plus_half = s + 0.5;
+    float series = coeff0 + coeff1/(s + 1.0);
+    float pow_base = (s_plus_half + lanczos_g)/euler;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+vec4 gamma(vec4 s)
+{
+    //  Requires:   s is the standard gamma function argument in [0, 36].
+    //  Returns:    Approximate gamma(s); gamma_impl() quotes a maximum
+    //              relative error of 0.000463.
+    vec4 s_inv = vec4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+vec3 gamma(vec3 s)
+{
+    vec3 s_inv = vec3(1.0)/s;  //  vec3 overload; reciprocal for gamma_impl()
+    return gamma_impl(s, s_inv);
+}
+
+vec2 gamma(vec2 s)
+{
+    vec2 s_inv = vec2(1.0)/s;  //  vec2 overload; reciprocal for gamma_impl()
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(float s)
+{
+    float s_inv = 1.0/s;  //  Float overload; reciprocal for gamma_impl()
+    return gamma_impl(s, s_inv);
+}
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+vec4 ligamma_small_z_impl(vec4 s, vec4 z, vec4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed so callers can reuse it)
+    //  Returns:    A 4-term series for the lower incomplete gamma function
+    //              at small s and small z:
+    //                  z**s * sum(k = 0..3) of (-z)**k / ((s + k) * k!)
+    //  The terms are unrolled with their constants folded:
+    //      k = 0:   1/s                    (passed in as s_inv)
+    //      k = 1:  -z    / (s + 1)
+    //      k = 2:  +z**2 / (2*s + 4)       (2! * (s + 2))
+    //      k = 3:  -z**3 / (6*s + 18)      (3! * (s + 3))
+    //  An optional fifth term (k = 4) is kept below, commented out, exactly
+    //  as the original author left it.
+    vec4 z_squared = z*z;
+    vec4 z_cubed = z * z_squared;
+    vec4 term1_denom = s + vec4(1.0);
+    vec4 term2_denom = 2.0*s + vec4(4.0);
+    vec4 term3_denom = 6.0*s + vec4(18.0);
+    //vec4 term4_denom = 24.0*s + vec4(96.0);
+    //  Accumulate in series order (k = 0, 1, 2, 3):
+    vec4 series = s_inv;
+    series -= z/term1_denom;
+    series += z_squared/term2_denom;
+    series -= z_cubed/term3_denom;
+    //series += z_squared * z_squared / term4_denom;
+    //  Apply the common z**s factor:
+    vec4 z_pow_s = pow(z, s);
+    return z_pow_s * series;
+}
+
+vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv)
+{
+    //  vec3 overload: z**s * (1/s - z/(s+1) + z**2/(2s+4) - z**3/(6s+18))
+    vec3 z_squared = z*z;
+    vec3 z_cubed = z * z_squared;
+    vec3 term1_denom = s + vec3(1.0);
+    vec3 term2_denom = 2.0*s + vec3(4.0);
+    vec3 term3_denom = 6.0*s + vec3(18.0);
+    vec3 series = s_inv;
+    series -= z/term1_denom;
+    series += z_squared/term2_denom;
+    series -= z_cubed/term3_denom;
+    return pow(z, s) * series;
+}
+
+vec2 ligamma_small_z_impl(vec2 s, vec2 z, vec2 s_inv)
+{
+    //  vec2 overload: z**s * (1/s - z/(s+1) + z**2/(2s+4) - z**3/(6s+18))
+    vec2 z_squared = z*z;
+    vec2 z_cubed = z * z_squared;
+    vec2 term1_denom = s + vec2(1.0);
+    vec2 term2_denom = 2.0*s + vec2(4.0);
+    vec2 term3_denom = 6.0*s + vec2(18.0);
+    vec2 series = s_inv;
+    series -= z/term1_denom;
+    series += z_squared/term2_denom;
+    series -= z_cubed/term3_denom;
+    return pow(z, s) * series;
+}
+
+float ligamma_small_z_impl(float s, float z, float s_inv)
+{
+    //  Float overload: z**s * (1/s - z/(s+1) + z**2/(2s+4) - z**3/(6s+18))
+    float z_squared = z*z;
+    float z_cubed = z * z_squared;
+    float term1_denom = s + 1.0;
+    float term2_denom = 2.0*s + 4.0;
+    float term3_denom = 6.0*s + 18.0;
+    float series = s_inv;
+    series -= z/term1_denom;
+    series += z_squared/term2_denom;
+    series -= z_cubed/term3_denom;
+    return pow(z, s) * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+vec4 uigamma_large_z_impl(vec4 s, vec4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    A 4-level truncation of Gauss's continued fraction for
+    //              the upper incomplete gamma function:
+    //                  z**s * e**(-z) / level1
+    //              where, evaluating from the innermost level outward,
+    //                  level4 = 7 + z - s
+    //                  level_i = (2*i - 1) + z - s + i*(s - i)/level_(i+1)
+    //              for i = 3, 2, 1.  The i*(s - i) numerators are folded
+    //              into constants below: 3*s - 9, 2*s - 4 and s - 1.
+    //  Innermost level first (the truncated tail is treated as infinite, so
+    //  its correction term vanishes):
+    vec4 level4 = vec4(7.0) + z - s;
+    vec4 level3 = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/level4;
+    vec4 level2 = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/level3;
+    vec4 level1 = vec4(1.0) + z - s + (s - vec4(1.0))/level2;
+    //  Leading factor that multiplies the whole fraction:
+    vec4 prefactor = pow(z, s) * exp(-z);
+    return prefactor / level1;
+}
+
+vec3 uigamma_large_z_impl(vec3 s, vec3 z)
+{
+    //  vec3 overload: 4-level continued fraction, innermost level first.
+    vec3 level4 = vec3(7.0) + z - s;
+    vec3 level3 = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/level4;
+    vec3 level2 = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/level3;
+    vec3 level1 = vec3(1.0) + z - s + (s - vec3(1.0))/level2;
+    vec3 prefactor = pow(z, s) * exp(-z);
+    return prefactor / level1;
+}
+
+vec2 uigamma_large_z_impl(vec2 s, vec2 z)
+{
+    //  vec2 overload: 4-level continued fraction, innermost level first.
+    vec2 level4 = vec2(7.0) + z - s;
+    vec2 level3 = vec2(5.0) + z - s + (3.0*s - vec2(9.0))/level4;
+    vec2 level2 = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/level3;
+    vec2 level1 = vec2(1.0) + z - s + (s - vec2(1.0))/level2;
+    vec2 prefactor = pow(z, s) * exp(-z);
+    return prefactor / level1;
+}
+
+float uigamma_large_z_impl(float s, float z)
+{
+    //  Float overload: 4-level continued fraction, innermost level first.
+    float level4 = 7.0 + z - s;
+    float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
+    float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
+    float level1 = 1.0 + z - s + (s - 1.0)/level2;
+    float prefactor = pow(z, s) * exp(-z);
+    return prefactor / level1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+vec4 normalized_ligamma_impl(vec4 s, vec4 z,
+    vec4 s_inv, vec4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  Only two branches (not four) are needed for
+    //              such s, selected per component by z.  Each branch uses four
+    //              terms, with a quoted max relative error of ~0.00182.  The
+    //              threshold and specifics were adapted for fewer terms from
+    //              Gil/Segura/Temme: http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both branches are always evaluated and then blended with a 0.0/1.0
+    //  mask, because real branches tested slower even when available.
+    vec4 thresh = vec4(0.775075);
+    vec4 z_is_large = vec4(greaterThan(z, thresh));  //  1.0 where z > thresh
+    vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  Blend with complementary masks.  (The small-z branch previously reused
+    //  the large-z mask, zeroing the result for every z <= thresh.)
+    return large_z * z_is_large + small_z * (vec4(1.0) - z_is_large);
+}
+
+vec3 normalized_ligamma_impl(vec3 s, vec3 z,
+    vec3 s_inv, vec3 gamma_s_inv)
+{
+    //  vec3 version.  Both branches are evaluated; a 0.0/1.0 mask picks one.
+    vec3 thresh = vec3(0.775075);
+    vec3 z_is_large = vec3(greaterThan(z, thresh));  //  1.0 where z > thresh
+    vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  small_z takes the complementary mask (it used to share large_z's mask):
+    return large_z * z_is_large + small_z * (vec3(1.0) - z_is_large);
+}
+
+vec2 normalized_ligamma_impl(vec2 s, vec2 z,
+    vec2 s_inv, vec2 gamma_s_inv)
+{
+    //  vec2 version.  Both branches are evaluated; a 0.0/1.0 mask picks one.
+    vec2 thresh = vec2(0.775075);
+    vec2 z_is_large = vec2(greaterThan(z, thresh));  //  1.0 where z > thresh
+    vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  small_z takes the complementary mask (it used to share large_z's mask):
+    return large_z * z_is_large + small_z * (vec2(1.0) - z_is_large);
+}
+
+float normalized_ligamma_impl(float s, float z,
+    float s_inv, float gamma_s_inv)
+{
+    //  Float version.  Both branches are evaluated; a 0.0/1.0 mask picks one.
+    float thresh = 0.775075;
+    float z_is_large = (z > thresh) ? 1.0 : 0.0;
+    float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  small_z takes the complementary mask (it used to share large_z's mask):
+    return large_z * z_is_large + small_z * (1.0 - z_is_large);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+vec4 normalized_ligamma(vec4 s, vec4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    The approximate normalized lower incomplete gamma function
+    //              for s < 0.5; normalized_ligamma_impl() has the details.
+    vec4 recip_s = vec4(1.0)/s;
+    vec4 recip_gamma_s = vec4(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+vec3 normalized_ligamma(vec3 s, vec3 z)
+{
+    //  vec3 overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+    vec3 recip_s = vec3(1.0)/s;
+    vec3 recip_gamma_s = vec3(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+vec2 normalized_ligamma(vec2 s, vec2 z)
+{
+    //  vec2 overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+    vec2 recip_s = vec2(1.0)/s;
+    vec2 recip_gamma_s = vec2(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(float s, float z)
+{
+    //  Float overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+    float recip_s = 1.0/s;
+    float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+vec4 uv2_to_uv4(vec2 tex_uv)
+{
+    //  Pad a vec2 uv offset with zeros so it can be added to vec4 tex2Dlod coords:
+    return vec4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Squared-length helper macro (for static constants).  NOTE(review): repeats the earlier definition in this file; keep both token-identical.
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+float get_fast_gaussian_weight_sum_inv(float sigma)
+{
+    //  Requires:   sigma is the Gaussian standard deviation of the blur.
+    //  Returns:    A curve-fit estimate of 1.0/(sum of the unnormalized
+    //              Gaussian weights), where the center weight is 1.0.
+    //  Background (from the original author): because the unnormalized
+    //  center weight is 1.0, the normalized center weight equals the weight
+    //  sum inverse, and for large enough blurs (9+) it approaches
+    //      0.5 * (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //          == erf((0.5/sqrt(2.0))/sigma)
+    //  The fit below was built from 64 blur sizes in [3, 131) and 255
+    //  equally-spaced sigmas in (0, blurN_std_dev), so it is biased toward
+    //  smaller blurs at small sigmas.  Quoted max error: 0.0031793913.
+    //  Quoted relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    float double_exp_fit = exp(exp(0.348348412457428/
+        (sigma - 0.0860587260734721)));
+    float reciprocal_fit = 0.399334576340352/sigma;
+    //  The smaller of the two fitted curves is the estimate:
+    return min(double_exp_fit, reciprocal_fit);
+}
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D Gaussian-blurred lookup built from 11 taps spaced
+    //              dxdy apart and centered on tex_uv.  It may be mipmapped
+    //              depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-5 (constant factors
+    //  are dropped because the sum is normalized at the end):
+    float inv_two_var = 0.5/(sigma*sigma);
+    float weight0 = 1.0;
+    float weight1 = exp(-1.0 * inv_two_var);
+    float weight2 = exp(-4.0 * inv_two_var);
+    float weight3 = exp(-9.0 * inv_two_var);
+    float weight4 = exp(-16.0 * inv_two_var);
+    float weight5 = exp(-25.0 * inv_two_var);
+    float norm = 1.0 /
+        (weight0 + 2.0 * (weight1 + weight2 + weight3 + weight4 + weight5));
+    //  Accumulate taps from -5 to +5 strides, then normalize:
+    vec3 accum = vec3(0.0);
+    accum += weight5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    accum += weight4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    accum += weight3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    accum += weight2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += weight1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += weight0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += weight1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += weight2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    accum += weight3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    accum += weight4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    accum += weight5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return accum * norm;
+}
+
+vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 9x Gaussian blur from 9 taps spaced dxdy apart and
+    //              centered on tex_uv.  May be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for tap distances 0-4:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float weight0 = 1.0;
+    float weight1 = exp(-1.0 * inv_two_var);
+    float weight2 = exp(-4.0 * inv_two_var);
+    float weight3 = exp(-9.0 * inv_two_var);
+    float weight4 = exp(-16.0 * inv_two_var);
+    float norm = 1.0 / (weight0 + 2.0 * (weight1 + weight2 + weight3 + weight4));
+    //  Accumulate taps from -4 to +4 strides, then normalize:
+    vec3 accum = vec3(0.0);
+    accum += weight4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    accum += weight3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    accum += weight2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += weight1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += weight0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += weight1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += weight2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    accum += weight3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    accum += weight4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return accum * norm;
+}
+
+vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 7x Gaussian blur from 7 taps spaced dxdy apart and
+    //              centered on tex_uv.  May be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for tap distances 0-3:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float weight0 = 1.0;
+    float weight1 = exp(-1.0 * inv_two_var);
+    float weight2 = exp(-4.0 * inv_two_var);
+    float weight3 = exp(-9.0 * inv_two_var);
+    float norm = 1.0 / (weight0 + 2.0 * (weight1 + weight2 + weight3));
+    //  Accumulate taps from -3 to +3 strides, then normalize:
+    vec3 accum = vec3(0.0);
+    accum += weight3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    accum += weight2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += weight1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += weight0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += weight1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += weight2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    accum += weight3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return accum * norm;
+}
+
+vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 5x Gaussian blur from 5 taps spaced dxdy apart and
+    //              centered on tex_uv.  May be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for tap distances 0-2:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float weight0 = 1.0;
+    float weight1 = exp(-1.0 * inv_two_var);
+    float weight2 = exp(-4.0 * inv_two_var);
+    float norm = 1.0 / (weight0 + 2.0 * (weight1 + weight2));
+    //  Accumulate taps from -2 to +2 strides, then normalize:
+    vec3 accum = vec3(0.0);
+    accum += weight2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += weight1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += weight0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += weight1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += weight2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return accum * norm;
+}
+
+vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 3x Gaussian blur from 3 taps spaced dxdy apart and
+    //              centered on tex_uv.  May be mipmapped (settings and dxdy).
+    //  Unnormalized Gaussian weights for tap distances 0-1:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float weight0 = 1.0;
+    float weight1 = exp(-1.0 * inv_two_var);
+    float norm = 1.0 / (weight0 + 2.0 * weight1);
+    //  Accumulate taps from -1 to +1 strides, then normalize:
+    vec3 accum = vec3(0.0);
+    accum += weight1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += weight0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += weight1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    return accum * norm;
+}
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    A 1D 11x Gaussian blurred lookup from 6 linear taps (texel
+    //              pairs).  It may be mipmapped depending on settings and dxdy.
+    //  Per-texel Gaussian weights for distances 0-5 and their normalization:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float weight0 = 1.0;
+    float weight1 = exp(-1.0 * inv_two_var);
+    float weight2 = exp(-4.0 * inv_two_var);
+    float weight3 = exp(-9.0 * inv_two_var);
+    float weight4 = exp(-16.0 * inv_two_var);
+    float weight5 = exp(-25.0 * inv_two_var);
+    float norm = 1.0 /
+        (weight0 + 2.0 * (weight1 + weight2 + weight3 + weight4 + weight5));
+    //  Pair up neighboring texels (one linear tap per pair).  The center texel
+    //  is shared by both inner taps, so each gets half of its weight.
+    float pair01 = weight0 * 0.5 + weight1;
+    float pair23 = weight2 + weight3;
+    float pair45 = weight4 + weight5;
+    vec2 offset01 = (weight1/pair01) * dxdy;
+    vec2 offset23 = (2.0 + weight3/pair23) * dxdy;
+    vec2 offset45 = (4.0 + weight5/pair45) * dxdy;
+    //  Each offset sits between its two texels; sum the taps and normalize:
+    vec3 accum = vec3(0.0);
+    accum += pair45 * tex2D_linearize(tex, tex_uv - offset45).rgb;
+    accum += pair23 * tex2D_linearize(tex, tex_uv - offset23).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv - offset01).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv + offset01).rgb;
+    accum += pair23 * tex2D_linearize(tex, tex_uv + offset23).rgb;
+    accum += pair45 * tex2D_linearize(tex, tex_uv + offset45).rgb;
+    return accum * norm;
+}
+
+vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as the 11x fast blur (tex2Dblur11fast).
+    //  Returns:    A 1D 17x Gaussian blur from 1 center tap plus 8 linear
+    //              taps (one per texel pair).  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Per-texel Gaussian weights for distances 0-8:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float weight0 = 1.0;
+    float weight1 = exp(-1.0 * inv_two_var);
+    float weight2 = exp(-4.0 * inv_two_var);
+    float weight3 = exp(-9.0 * inv_two_var);
+    float weight4 = exp(-16.0 * inv_two_var);
+    float weight5 = exp(-25.0 * inv_two_var);
+    float weight6 = exp(-36.0 * inv_two_var);
+    float weight7 = exp(-49.0 * inv_two_var);
+    float weight8 = exp(-64.0 * inv_two_var);
+    //float norm = 1.0 / (weight0 + 2.0 * (weight1 + weight2 + weight3 +
+    //    weight4 + weight5 + weight6 + weight7 + weight8));
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Pair up neighboring texels; each pair is fetched with one linear tap:
+    float pair1_2 = weight1 + weight2;
+    float pair3_4 = weight3 + weight4;
+    float pair5_6 = weight5 + weight6;
+    float pair7_8 = weight7 + weight8;
+    vec2 offset1_2 = (1.0 + weight2/pair1_2) * dxdy;
+    vec2 offset3_4 = (3.0 + weight4/pair3_4) * dxdy;
+    vec2 offset5_6 = (5.0 + weight6/pair5_6) * dxdy;
+    vec2 offset7_8 = (7.0 + weight8/pair7_8) * dxdy;
+    //  Each offset sits between its two texels; sum the taps and normalize:
+    vec3 accum = vec3(0.0);
+    accum += pair7_8 * tex2D_linearize(tex, tex_uv - offset7_8).rgb;
+    accum += pair5_6 * tex2D_linearize(tex, tex_uv - offset5_6).rgb;
+    accum += pair3_4 * tex2D_linearize(tex, tex_uv - offset3_4).rgb;
+    accum += pair1_2 * tex2D_linearize(tex, tex_uv - offset1_2).rgb;
+    accum += weight0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += pair1_2 * tex2D_linearize(tex, tex_uv + offset1_2).rgb;
+    accum += pair3_4 * tex2D_linearize(tex, tex_uv + offset3_4).rgb;
+    accum += pair5_6 * tex2D_linearize(tex, tex_uv + offset5_6).rgb;
+    accum += pair7_8 * tex2D_linearize(tex, tex_uv + offset7_8).rgb;
+    return accum * norm;
+}
+
+vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as the 11x fast blur (tex2Dblur11fast).
+    //  Returns:    A 1D 25x Gaussian blur from 1 center tap plus 12 linear
+    //              taps (one per texel pair).  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Per-texel Gaussian weights for distances 0-12:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float weight0 = 1.0;
+    float weight1 = exp(-1.0 * inv_two_var);
+    float weight2 = exp(-4.0 * inv_two_var);
+    float weight3 = exp(-9.0 * inv_two_var);
+    float weight4 = exp(-16.0 * inv_two_var);
+    float weight5 = exp(-25.0 * inv_two_var);
+    float weight6 = exp(-36.0 * inv_two_var);
+    float weight7 = exp(-49.0 * inv_two_var);
+    float weight8 = exp(-64.0 * inv_two_var);
+    float weight9 = exp(-81.0 * inv_two_var);
+    float weight10 = exp(-100.0 * inv_two_var);
+    float weight11 = exp(-121.0 * inv_two_var);
+    float weight12 = exp(-144.0 * inv_two_var);
+    //float norm = 1.0 / (weight0 + 2.0 * (weight1 + weight2 + weight3 + weight4 + weight5 +
+    //    weight6 + weight7 + weight8 + weight9 + weight10 + weight11 + weight12));
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Pair up neighboring texels; each pair is fetched with one linear tap:
+    float pair1_2 = weight1 + weight2;
+    float pair3_4 = weight3 + weight4;
+    float pair5_6 = weight5 + weight6;
+    float pair7_8 = weight7 + weight8;
+    float pair9_10 = weight9 + weight10;
+    float pair11_12 = weight11 + weight12;
+    vec2 offset1_2 = (1.0 + weight2/pair1_2) * dxdy;
+    vec2 offset3_4 = (3.0 + weight4/pair3_4) * dxdy;
+    vec2 offset5_6 = (5.0 + weight6/pair5_6) * dxdy;
+    vec2 offset7_8 = (7.0 + weight8/pair7_8) * dxdy;
+    vec2 offset9_10 = (9.0 + weight10/pair9_10) * dxdy;
+    vec2 offset11_12 = (11.0 + weight12/pair11_12) * dxdy;
+    //  Each offset sits between its two texels; sum the taps and normalize:
+    vec3 accum = vec3(0.0);
+    accum += pair11_12 * tex2D_linearize(tex, tex_uv - offset11_12).rgb;
+    accum += pair9_10 * tex2D_linearize(tex, tex_uv - offset9_10).rgb;
+    accum += pair7_8 * tex2D_linearize(tex, tex_uv - offset7_8).rgb;
+    accum += pair5_6 * tex2D_linearize(tex, tex_uv - offset5_6).rgb;
+    accum += pair3_4 * tex2D_linearize(tex, tex_uv - offset3_4).rgb;
+    accum += pair1_2 * tex2D_linearize(tex, tex_uv - offset1_2).rgb;
+    accum += weight0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += pair1_2 * tex2D_linearize(tex, tex_uv + offset1_2).rgb;
+    accum += pair3_4 * tex2D_linearize(tex, tex_uv + offset3_4).rgb;
+    accum += pair5_6 * tex2D_linearize(tex, tex_uv + offset5_6).rgb;
+    accum += pair7_8 * tex2D_linearize(tex, tex_uv + offset7_8).rgb;
+    accum += pair9_10 * tex2D_linearize(tex, tex_uv + offset9_10).rgb;
+    accum += pair11_12 * tex2D_linearize(tex, tex_uv + offset11_12).rgb;
+    return accum * norm;
+}
+
+vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 31-texel Gaussian blur taken along dxdy using 16 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights: gK = exp(-K*K / (2.0 * sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float g5 = exp(-25.0 * inv_two_var);
+    float g6 = exp(-36.0 * inv_two_var);
+    float g7 = exp(-49.0 * inv_two_var);
+    float g8 = exp(-64.0 * inv_two_var);
+    float g9 = exp(-81.0 * inv_two_var);
+    float g10 = exp(-100.0 * inv_two_var);
+    float g11 = exp(-121.0 * inv_two_var);
+    float g12 = exp(-144.0 * inv_two_var);
+    float g13 = exp(-169.0 * inv_two_var);
+    float g14 = exp(-196.0 * inv_two_var);
+    float g15 = exp(-225.0 * inv_two_var);
+    //  The explicit factor
+    //      1.0 / (g0 + 2.0 * (g1 + g2 + ... + g15))
+    //  is not evaluated here; the normalization comes from the helper below.
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Fold each texel pair into one linear tap: pair weight, then uv offset.
+    //  Both innermost taps read the center texel, so each gets half of g0.
+    float pair_0_1 = g0 * 0.5 + g1;
+    float pair_2_3 = g2 + g3;
+    float pair_4_5 = g4 + g5;
+    float pair_6_7 = g6 + g7;
+    float pair_8_9 = g8 + g9;
+    float pair_10_11 = g10 + g11;
+    float pair_12_13 = g12 + g13;
+    float pair_14_15 = g14 + g15;
+    vec2 off_0_1 = (g1/pair_0_1) * dxdy;
+    vec2 off_2_3 = (2.0 + g3/pair_2_3) * dxdy;
+    vec2 off_4_5 = (4.0 + g5/pair_4_5) * dxdy;
+    vec2 off_6_7 = (6.0 + g7/pair_6_7) * dxdy;
+    vec2 off_8_9 = (8.0 + g9/pair_8_9) * dxdy;
+    vec2 off_10_11 = (10.0 + g11/pair_10_11) * dxdy;
+    vec2 off_12_13 = (12.0 + g13/pair_12_13) * dxdy;
+    vec2 off_14_15 = (14.0 + g15/pair_14_15) * dxdy;
+    //  Accumulate the taps from one end of the kernel to the other, then scale:
+    vec3 acc = vec3(0.0);
+    acc += pair_14_15 * tex2D_linearize(tex, tex_uv - off_14_15).rgb;
+    acc += pair_12_13 * tex2D_linearize(tex, tex_uv - off_12_13).rgb;
+    acc += pair_10_11 * tex2D_linearize(tex, tex_uv - off_10_11).rgb;
+    acc += pair_8_9 * tex2D_linearize(tex, tex_uv - off_8_9).rgb;
+    acc += pair_6_7 * tex2D_linearize(tex, tex_uv - off_6_7).rgb;
+    acc += pair_4_5 * tex2D_linearize(tex, tex_uv - off_4_5).rgb;
+    acc += pair_2_3 * tex2D_linearize(tex, tex_uv - off_2_3).rgb;
+    acc += pair_0_1 * tex2D_linearize(tex, tex_uv - off_0_1).rgb;
+    acc += pair_0_1 * tex2D_linearize(tex, tex_uv + off_0_1).rgb;
+    acc += pair_2_3 * tex2D_linearize(tex, tex_uv + off_2_3).rgb;
+    acc += pair_4_5 * tex2D_linearize(tex, tex_uv + off_4_5).rgb;
+    acc += pair_6_7 * tex2D_linearize(tex, tex_uv + off_6_7).rgb;
+    acc += pair_8_9 * tex2D_linearize(tex, tex_uv + off_8_9).rgb;
+    acc += pair_10_11 * tex2D_linearize(tex, tex_uv + off_10_11).rgb;
+    acc += pair_12_13 * tex2D_linearize(tex, tex_uv + off_12_13).rgb;
+    acc += pair_14_15 * tex2D_linearize(tex, tex_uv + off_14_15).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 43-texel Gaussian blur taken along dxdy using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights: gK = exp(-K*K / (2.0 * sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float g5 = exp(-25.0 * inv_two_var);
+    float g6 = exp(-36.0 * inv_two_var);
+    float g7 = exp(-49.0 * inv_two_var);
+    float g8 = exp(-64.0 * inv_two_var);
+    float g9 = exp(-81.0 * inv_two_var);
+    float g10 = exp(-100.0 * inv_two_var);
+    float g11 = exp(-121.0 * inv_two_var);
+    float g12 = exp(-144.0 * inv_two_var);
+    float g13 = exp(-169.0 * inv_two_var);
+    float g14 = exp(-196.0 * inv_two_var);
+    float g15 = exp(-225.0 * inv_two_var);
+    float g16 = exp(-256.0 * inv_two_var);
+    float g17 = exp(-289.0 * inv_two_var);
+    float g18 = exp(-324.0 * inv_two_var);
+    float g19 = exp(-361.0 * inv_two_var);
+    float g20 = exp(-400.0 * inv_two_var);
+    float g21 = exp(-441.0 * inv_two_var);
+    //  The explicit factor
+    //      1.0 / (g0 + 2.0 * (g1 + g2 + ... + g21))
+    //  is not evaluated here; the normalization comes from the helper below.
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Fold each texel pair into one linear tap: pair weight, then uv offset.
+    //  Both innermost taps read the center texel, so each gets half of g0.
+    float pair_0_1 = g0 * 0.5 + g1;
+    float pair_2_3 = g2 + g3;
+    float pair_4_5 = g4 + g5;
+    float pair_6_7 = g6 + g7;
+    float pair_8_9 = g8 + g9;
+    float pair_10_11 = g10 + g11;
+    float pair_12_13 = g12 + g13;
+    float pair_14_15 = g14 + g15;
+    float pair_16_17 = g16 + g17;
+    float pair_18_19 = g18 + g19;
+    float pair_20_21 = g20 + g21;
+    vec2 off_0_1 = (g1/pair_0_1) * dxdy;
+    vec2 off_2_3 = (2.0 + g3/pair_2_3) * dxdy;
+    vec2 off_4_5 = (4.0 + g5/pair_4_5) * dxdy;
+    vec2 off_6_7 = (6.0 + g7/pair_6_7) * dxdy;
+    vec2 off_8_9 = (8.0 + g9/pair_8_9) * dxdy;
+    vec2 off_10_11 = (10.0 + g11/pair_10_11) * dxdy;
+    vec2 off_12_13 = (12.0 + g13/pair_12_13) * dxdy;
+    vec2 off_14_15 = (14.0 + g15/pair_14_15) * dxdy;
+    vec2 off_16_17 = (16.0 + g17/pair_16_17) * dxdy;
+    vec2 off_18_19 = (18.0 + g19/pair_18_19) * dxdy;
+    vec2 off_20_21 = (20.0 + g21/pair_20_21) * dxdy;
+    //  Accumulate the taps from one end of the kernel to the other, then scale:
+    vec3 acc = vec3(0.0);
+    acc += pair_20_21 * tex2D_linearize(tex, tex_uv - off_20_21).rgb;
+    acc += pair_18_19 * tex2D_linearize(tex, tex_uv - off_18_19).rgb;
+    acc += pair_16_17 * tex2D_linearize(tex, tex_uv - off_16_17).rgb;
+    acc += pair_14_15 * tex2D_linearize(tex, tex_uv - off_14_15).rgb;
+    acc += pair_12_13 * tex2D_linearize(tex, tex_uv - off_12_13).rgb;
+    acc += pair_10_11 * tex2D_linearize(tex, tex_uv - off_10_11).rgb;
+    acc += pair_8_9 * tex2D_linearize(tex, tex_uv - off_8_9).rgb;
+    acc += pair_6_7 * tex2D_linearize(tex, tex_uv - off_6_7).rgb;
+    acc += pair_4_5 * tex2D_linearize(tex, tex_uv - off_4_5).rgb;
+    acc += pair_2_3 * tex2D_linearize(tex, tex_uv - off_2_3).rgb;
+    acc += pair_0_1 * tex2D_linearize(tex, tex_uv - off_0_1).rgb;
+    acc += pair_0_1 * tex2D_linearize(tex, tex_uv + off_0_1).rgb;
+    acc += pair_2_3 * tex2D_linearize(tex, tex_uv + off_2_3).rgb;
+    acc += pair_4_5 * tex2D_linearize(tex, tex_uv + off_4_5).rgb;
+    acc += pair_6_7 * tex2D_linearize(tex, tex_uv + off_6_7).rgb;
+    acc += pair_8_9 * tex2D_linearize(tex, tex_uv + off_8_9).rgb;
+    acc += pair_10_11 * tex2D_linearize(tex, tex_uv + off_10_11).rgb;
+    acc += pair_12_13 * tex2D_linearize(tex, tex_uv + off_12_13).rgb;
+    acc += pair_14_15 * tex2D_linearize(tex, tex_uv + off_14_15).rgb;
+    acc += pair_16_17 * tex2D_linearize(tex, tex_uv + off_16_17).rgb;
+    acc += pair_18_19 * tex2D_linearize(tex, tex_uv + off_18_19).rgb;
+    acc += pair_20_21 * tex2D_linearize(tex, tex_uv + off_20_21).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  First get the texel weights (w0 = center texel, w1 = adjacent texels).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    //  No weight_sum_inv is needed: each tap's normalized weight is exactly
+    //  (w0 * 0.5 + w1)/(w0 + 2.0 * w1) = 0.5.  The center texel (with weight
+    //  w0) is shared by both linear taps, so halve its weight.
+    float w01 = w0 * 0.5 + w1;
+    float w01_ratio = w1/w01;
+    //  Weights for both samples are the same, so just average them:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 5-texel Gaussian blur taken along dxdy, built from
+    //              1 center (nearest neighbor) tap and 2 linear taps.  It may
+    //              be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights: gK = exp(-K*K / (2.0 * sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Fold texels 1 and 2 into one linear tap: pair weight, then uv offset.
+    float pair_1_2 = g1 + g2;
+    vec2 off_1_2 = (1.0 + g2/pair_1_2) * dxdy;
+    //  Accumulate the taps from one end of the kernel to the other, then scale:
+    vec3 acc = vec3(0.0);
+    acc += pair_1_2 * tex2D_linearize(tex, tex_uv - off_1_2).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair_1_2 * tex2D_linearize(tex, tex_uv + off_1_2).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 7-texel Gaussian blur taken along dxdy using 4 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights: gK = exp(-K*K / (2.0 * sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Fold each texel pair into one linear tap: pair weight, then uv offset.
+    //  Both innermost taps read the center texel, so each gets half of g0.
+    float pair_0_1 = g0 * 0.5 + g1;
+    float pair_2_3 = g2 + g3;
+    vec2 off_0_1 = (g1/pair_0_1) * dxdy;
+    vec2 off_2_3 = (2.0 + g3/pair_2_3) * dxdy;
+    //  Accumulate the taps from one end of the kernel to the other, then scale:
+    vec3 acc = vec3(0.0);
+    acc += pair_2_3 * tex2D_linearize(tex, tex_uv - off_2_3).rgb;
+    acc += pair_0_1 * tex2D_linearize(tex, tex_uv - off_0_1).rgb;
+    acc += pair_0_1 * tex2D_linearize(tex, tex_uv + off_0_1).rgb;
+    acc += pair_2_3 * tex2D_linearize(tex, tex_uv + off_2_3).rgb;
+    return acc * norm;
+}
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input.
+    //  Description:
+    //  The only arbitrarily resizable one-pass blur: a tex2Dblur5x5resize
+    //  would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize.
+    float inv_two_var = 0.5/(sigma*sigma);
+    //  Fetch the full 3x3 neighborhood, one row at a time.  Quad-pixel
+    //  communication would not help: this should perform like tex2Dblur5x5,
+    //  but sharing a 4x4 sample field would be more like tex2Dblur8x8shared.
+    vec2 mid_row_uv = tex_uv;
+    vec2 step_x = vec2(dxdy.x, 0.0);
+    vec2 step_y = vec2(0.0, dxdy.y);
+    vec2 prev_row_uv = mid_row_uv - step_y;
+    vec2 next_row_uv = mid_row_uv + step_y;
+    vec3 prev_l = tex2D_linearize(tex, prev_row_uv - step_x).rgb;
+    vec3 prev_c = tex2D_linearize(tex, prev_row_uv).rgb;
+    vec3 prev_r = tex2D_linearize(tex, prev_row_uv + step_x).rgb;
+    vec3 mid_l = tex2D_linearize(tex, mid_row_uv - step_x).rgb;
+    vec3 mid_c = tex2D_linearize(tex, mid_row_uv).rgb;
+    vec3 mid_r = tex2D_linearize(tex, mid_row_uv + step_x).rgb;
+    vec3 next_l = tex2D_linearize(tex, next_row_uv - step_x).rgb;
+    vec3 next_c = tex2D_linearize(tex, next_row_uv).rgb;
+    vec3 next_r = tex2D_linearize(tex, next_row_uv + step_x).rgb;
+    //  Gaussian weights for the center, the 4 edge texels, and the 4 corners:
+    float w_center = 1.0;
+    float w_edge = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * inv_two_var);
+    float w_corner = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * inv_two_var);
+    float norm = 1.0/(w_center + 4.0 * (w_edge + w_corner));
+    //  Weight each group of samples, add the groups up, and normalize:
+    vec3 acc = w_center * mid_c +
+        w_edge * (prev_c + mid_l + mid_r + next_c) +
+        w_corner * (prev_l + prev_r + next_l + next_r);
+    return acc * norm;
+}
+
+//  Resizable one-pass blurs:
+vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Default-sigma overload: forwards to the 4-argument version with blur3_std_dev.
+    //  The sampler is named tex, as in the sibling blurs, so texture() is not hidden.
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 9-texel Gaussian blur taken along dxdy, built from
+    //              1 center (nearest neighbor) tap and 4 linear taps.  It may
+    //              be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights: gK = exp(-K*K / (2.0 * sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Fold each texel pair into one linear tap: pair weight, then uv offset.
+    float pair_1_2 = g1 + g2;
+    float pair_3_4 = g3 + g4;
+    vec2 off_1_2 = (1.0 + g2/pair_1_2) * dxdy;
+    vec2 off_3_4 = (3.0 + g4/pair_3_4) * dxdy;
+    //  Accumulate the taps from one end of the kernel to the other, then scale:
+    vec3 acc = vec3(0.0);
+    acc += pair_3_4 * tex2D_linearize(tex, tex_uv - off_3_4).rgb;
+    acc += pair_1_2 * tex2D_linearize(tex, tex_uv - off_1_2).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair_1_2 * tex2D_linearize(tex, tex_uv + off_1_2).rgb;
+    acc += pair_3_4 * tex2D_linearize(tex, tex_uv + off_3_4).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Purpose:    One-pass 9x9 Gaussian blur that needs only 25 texture reads.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.  Adjust the
+    //  bilinear sample location to reflect the true Gaussian weights for each
+    //  underlying texel.  The following diagram illustrates the relative
+    //  locations of bilinear samples.  Each sample with the same number has the
+    //  same weight (notice the symmetry).  The letters a, b, c, d distinguish
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w1off = exp(-1.0 * denom_inv);
+    float w2off = exp(-4.0 * denom_inv);
+    float w3off = exp(-9.0 * denom_inv);
+    float w4off = exp(-16.0 * denom_inv);
+    float texel1to2ratio = w2off/(w1off + w2off);
+    float texel3to4ratio = w4off/(w3off + w4off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0);
+    vec2 sample2R_texel_offset = vec2(3.0, 0.0) + vec2(texel3to4ratio, 0.0);
+    vec2 sample3d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio);
+    vec2 sample4d_texel_offset = vec2(3.0, 1.0) + vec2(texel3to4ratio, texel1to2ratio);
+    vec2 sample5d_texel_offset = vec2(1.0, 3.0) + vec2(texel1to2ratio, texel3to4ratio);
+    vec2 sample6d_texel_offset = vec2(3.0, 3.0) + vec2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    float w1R1 = w1off;
+    float w1R2 = w2off;
+    float w2R1 = w3off;
+    float w2R2 = w4off;
+    float w3d1 =     exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    float w3d2_3d3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
+    float w3d4 =     exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
+    float w4d1_5d1 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv);
+    float w4d2_5d3 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * denom_inv);
+    float w4d3_5d2 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv);
+    float w4d4_5d4 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * denom_inv);
+    float w6d1 =     exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv);
+    float w6d2_6d3 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * denom_inv);
+    float w6d4 =     exp(-LENGTH_SQ(vec2(4.0, 4.0)) * denom_inv);
+    //  Add texel weights per sample to get sample weights (w5 reuses w4's sum):
+    float w0 = 1.0;
+    float w1 = w1R1 + w1R2;
+    float w2 = w2R1 + w2R2;
+    float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    float w5 = w4;
+    float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse (w0 is used once, every other weight 4 times):
+    float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);
+    vec2 mirror_y = vec2(1.0, -1.0);
+    vec2 mirror_xy = vec2(-1.0, -1.0);
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear:
+    vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    vec3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    vec3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    vec3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    vec3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    vec3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    vec3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    vec3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    vec3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    vec3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    vec3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    vec3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    vec3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then normalize so the total weight is 1.0.
+    vec3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
+    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0off = 1.0;
+    float w1off = exp(-1.0 * denom_inv);
+    float w2off = exp(-4.0 * denom_inv);
+    float w3off = exp(-9.0 * denom_inv);
+    float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
+    float texel2to3ratio = w3off/(w2off + w3off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant (samples 1d, 2d, 3d, 4d):
+    vec2 sample1d_texel_offset = vec2(texel0to1ratio, texel0to1ratio);
+    vec2 sample2d_texel_offset = vec2(2.0, 0.0) + vec2(texel2to3ratio, texel0to1ratio);
+    vec2 sample3d_texel_offset = vec2(0.0, 2.0) + vec2(texel0to1ratio, texel2to3ratio);
+    vec2 sample4d_texel_offset = vec2(2.0, 2.0) + vec2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    float w1abcd = 1.0;
+    float w1bd2_1cd3 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv);
+    float w2bd1_3cd1 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * denom_inv);
+    float w2bd2_3cd2 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * denom_inv);
+    float w1d4 =       exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    float w2d3_3d2 =   exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
+    float w2d4_3d4 =   exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv);
+    float w4d1 =       exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
+    float w4d2_4d3 =   exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv);
+    float w4d4 =       exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights.
+    //  Split weights for shared texels between samples sharing them:
+    float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;
+    float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;
+    float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;
+    //  Get the weight sum inverse (each weight below covers 4 mirrored samples):
+    float weight_sum_inv =
+        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);
+    vec2 mirror_y = vec2(1.0, -1.0);
+    vec2 mirror_xy = vec2(-1.0, -1.0);
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    vec3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    vec3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    vec3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    vec3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then normalize so the total weight is 1.0.
+    vec3 sum = vec3(0.0);
+    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
+    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
+    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
+    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  First see the description for tex2Dblur9x9().  This blur uses the same
+    //  concept and sample/texel locations except on a smaller scale.  Samples:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2), the Gaussian exponent scale
+    float w1off = exp(-1.0 * denom_inv);  //  Weight at distance 1 (distance squared = 1)
+    float w2off = exp(-4.0 * denom_inv);  //  Weight at distance 2 (distance squared = 4)
+    float texel1to2ratio = w2off/(w1off + w2off);  //  Share of the pair's weight held by texel 2
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0);
+    vec2 sample2d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    float w1R1 = w1off;
+    float w1R2 = w2off;
+    float w2d1 =   exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    float w2d2_3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
+    float w2d4 =   exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    float w0 = 1.0;  //  exp(0): unnormalized weight of the center texel
+    float w1 = w1R1 + w1R2;
+    float w2 = w2d1 + 2.0 * w2d2_3 + w2d4;  //  Texels 2 and 3 of a diagonal sample share one weight
+    //  Get the weight sum inverse (normalization factor):
+    float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2));  //  1 center + 4 axis + 4 diagonal samples
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);
+    vec2 mirror_y = vec2(1.0, -1.0);
+    vec2 mirror_xy = vec2(-1.0, -1.0);
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;  //  .yx swaps the offset onto the y axis
+    vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    vec3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2a + sample2b + sample2c + sample2d);
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed of
+    //              2x2 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      0a 0b
+    //      0c 0d
+    //  The texel layout is as follows.  Note that samples 0a/0b and 0c/0d share
+    //  a vertical column of texels, and samples 0a/0c and 0b/0d share a
+    //  horizontal row of texels (all samples share the center texel):
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2), the Gaussian exponent scale
+    float w0off = 1.0;  //  exp(0): unnormalized weight of the center texel
+    float w1off = exp(-1.0 * denom_inv);
+    float texel0to1ratio = w1off/(w0off * 0.5 + w1off);  //  Center weight is halved, presumably because two samples share it per axis
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    vec2 sample0d_texel_offset = vec2(texel0to1ratio, texel0to1ratio);
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 4 samples using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);
+    vec2 mirror_y = vec2(1.0, -1.0);
+    vec2 mirror_xy = vec2(-1.0, -1.0);
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    vec3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb;
+    vec3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb;
+    vec3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb;
+    vec3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Weights for all samples are the same, so just average them:
+    return 0.25 * (sample0a + sample0b + sample0c + sample0d);
+}
+
+vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur9_std_dev.
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur17_std_dev.
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+
+vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur25_std_dev.
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+
+vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur43_std_dev.
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur31_std_dev.
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+
+vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur3_std_dev.
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur3_std_dev.
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur5_std_dev.
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+
+vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur5_std_dev.
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur3_std_dev.
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur5_std_dev.
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+
+vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur7_std_dev.
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur7_std_dev.
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur7_std_dev.
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur9_std_dev.
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur9_std_dev.
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur11_std_dev.
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+
+vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Wrapper: forward to the sigma overload with sigma = blur11_std_dev.
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+
+#endif  //  BLUR_FUNCTIONS_H
+
+#define Source source[0]
+#define tex_uv vTexCoord.xy
+
+#define InputSize sourceSize[0].xy
+#define TextureSize sourceSize[0].xy
+#define OutputSize targetSize.xy
+
+void main() {
+    //  Blur the source with tex2Dblur9fast, then encode the result as opaque:
+    vec3 blurred_rgb = tex2Dblur9fast(Source, tex_uv, blur_dxdy);
+    FragColor = encode_output(vec4(blurred_rgb, 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/blur9fast-horizontal.vs b/shaders/CRT-Royale.shader/blur9fast-horizontal.vs
new file mode 100644
index 00000000..7f3b2b94
--- /dev/null
+++ b/shaders/CRT-Royale.shader/blur9fast-horizontal.vs
@@ -0,0 +1,2025 @@
+#version 150
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+#if __VERSION__ >= 130
+#define COMPAT_TEXTURE texture
+#else
+#define COMPAT_TEXTURE texture2D
+#endif
+
+#ifdef GL_ES
+#define COMPAT_PRECISION mediump
+#else
+#define COMPAT_PRECISION
+#endif
+
+in vec4 position;
+in vec2 texCoord;
+
+out Vertex {
+   vec2 vTexCoord;
+   vec2 blur_dxdy;
+};
+
+uniform vec4 targetSize;
+uniform vec4 sourceSize[];
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  PASS SETTINGS:
+//  gamma-management.h needs to know what kind of pipeline we're using and
+//  what pass this is in that pipeline.  This will become obsolete if/when we
+//  can #define things like this in the .cgp preset file.
+//#define GAMMA_ENCODE_EVERY_FBO
+//#define FIRST_PASS
+//#define LAST_PASS
+//#define SIMULATE_CRT_ON_LCD
+//#define SIMULATE_GBA_ON_LCD
+//#define SIMULATE_LCD_ON_CRT
+//#define SIMULATE_GBA_ON_CRT
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    float lcd_reference_gamma = 2.5;       //  To match CRT
+    float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    float get_crt_gamma()    {   return crt_gamma;   }
+    float get_gba_gamma()    {   return gba_gamma;   }
+    float get_lcd_gamma()    {   return lcd_gamma;   }
+#else
+    float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    float get_input_gamma()          {   return input_gamma;         }
+    float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        float get_input_gamma()      {   return get_crt_gamma();     }
+        float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        float get_input_gamma()      {   return get_gba_gamma();     }
+        float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        float get_input_gamma()      {   return get_lcd_gamma();     }
+        float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        float get_input_gamma()      {   return get_gba_gamma();     }
+        float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        float get_input_gamma()      {   return ntsc_gamma;          }
+        float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        bool linearize_input = true;
+        float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        bool linearize_input = false;
+        float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        bool gamma_encode_output = true;
+        float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        bool gamma_encode_output = false;
+        float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else
+    bool linearize_input = true;
+    bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif
+
+vec4 decode_input(vec4 color)
+{
+    //  Requires:   color is a sample in this pass's input encoding.
+    //  Returns:    color unchanged when linearize_input is false.  Otherwise
+    //              rgb is raised to get_pass_input_gamma(), and alpha is
+    //              replaced by 1.0 only when assume_opaque_alpha is true.
+    //  The flags must be tested, never assigned: a condition written as
+    //  "flag = true" overwrites the setting and always evaluates to true,
+    //  which would force alpha to 1.0 and run pow() in every pass.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    //  Decode once and share the result between both alpha policies:
+    vec3 linear_rgb = pow(color.rgb, vec3(get_pass_input_gamma()));
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(linear_rgb, alpha);
+}
+
+vec4 encode_output(vec4 color)
+{
+    //  Requires:   color is the linear result computed by this pass.
+    //  Returns:    color unchanged when gamma_encode_output is false.
+    //              Otherwise rgb is raised to 1.0/get_pass_output_gamma(), and
+    //              alpha is replaced by 1.0 only when assume_opaque_alpha is true.
+    //  The flags must be tested, never assigned: a condition written as
+    //  "flag = true" overwrites the setting and always evaluates to true,
+    //  which would force alpha to 1.0 and run pow() in every pass.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    //  Encode once and share the result between both alpha policies:
+    vec3 encoded_rgb = pow(color.rgb, vec3(1.0/get_pass_output_gamma()));
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(encoded_rgb, alpha);
+}
+
+#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords)
+//{   return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords)));   }
+
+//#define tex2D_linearize(C, D, E) decode_input(vec4(COMPAT_TEXTURE(C, D, E)))
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords, int texel_off)
+//{   return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords, texel_off)));    }
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  IN.output_size < IN.video_size.
+//              4.) IN.output_size == IN.video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (IN.video_size/IN.output_size)/IN.texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(IN.video_size/IN.output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static float blur3_std_dev
+//                      static float blur4_std_dev
+//                      static float blur5_std_dev
+//                      static float blur6_std_dev
+//                      static float blur7_std_dev
+//                      static float blur8_std_dev
+//                      static float blur9_std_dev
+//                      static float blur10_std_dev
+//                      static float blur11_std_dev
+//                      static float blur12_std_dev
+//                      static float blur17_std_dev
+//                      static float blur25_std_dev
+//                      static float blur31_std_dev
+//                      static float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+#ifndef OVERRIDE_BLUR_STD_DEVS
+    //  blurN_std_dev values are specified in terms of dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  By request, we can define standard deviations corresponding to a
+        //  binomial distribution with p = 0.5 (related to Pascal's triangle).
+        //  This distribution works such that blurring multiple times should
+        //  have the same result as a single larger blur.  These values are
+        //  larger than default for blurs up to 6x and smaller thereafter.
+        float blur3_std_dev = 0.84931640625;
+        float blur4_std_dev = 0.84931640625;
+        float blur5_std_dev = 1.0595703125;
+        float blur6_std_dev = 1.06591796875;
+        float blur7_std_dev = 1.17041015625;
+        float blur8_std_dev = 1.1720703125;
+        float blur9_std_dev = 1.2259765625;
+        float blur10_std_dev = 1.21982421875;
+        float blur11_std_dev = 1.25361328125;
+        float blur12_std_dev = 1.2423828125;
+        float blur17_std_dev = 1.27783203125;
+        float blur25_std_dev = 1.2810546875;
+        float blur31_std_dev = 1.28125;
+        float blur43_std_dev = 1.28125;
+    #else
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        float blur3_std_dev = 0.62666015625;
+        float blur4_std_dev = 0.66171875;
+        float blur5_std_dev = 0.9845703125;
+        float blur6_std_dev = 1.02626953125;
+        float blur7_std_dev = 1.36103515625;
+        float blur8_std_dev = 1.4080078125;
+        float blur9_std_dev = 1.7533203125;
+        float blur10_std_dev = 1.80478515625;
+        float blur11_std_dev = 2.15986328125;
+        float blur12_std_dev = 2.215234375;
+        float blur17_std_dev = 3.45535583496;
+        float blur25_std_dev = 5.3409576416;
+        float blur31_std_dev = 6.86488037109;
+        float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
+    //  in shared-sample blurs but increase blurring and feature shifting.
+    float error_blurring = 0.5;
+#endif
+
+//  Make a length squared helper macro (for usage with static constants):
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+//#include "quad-pixel-communication.h"
+//#include "special-functions.h"
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (vec4/vec3/vec2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+vec4 erf6(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    A per-component Abramowitz/Stegun approximation of
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              evaluated on abs(x), with the sign of x restored at the
+    //              end.  Stated max absolute error: 2.5*10**-5.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    vec4 unity = vec4(1.0);
+    vec4 t = unity/(unity + 0.47047*abs(x));
+    //  Cubic in t (Horner form), then the Gaussian falloff:
+    vec4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec4 magnitude = unity - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+vec3 erf6(vec3 x)
+{
+    //  vec3 overload: Abramowitz/Stegun erf() approximation per component.
+    vec3 unity = vec3(1.0);
+    vec3 t = unity/(unity + 0.47047*abs(x));
+    //  Cubic in t (Horner form), scaled by the Gaussian falloff:
+    vec3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec3 magnitude = unity - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+vec2 erf6(vec2 x)
+{
+    //  vec2 overload: Abramowitz/Stegun erf() approximation per component.
+    vec2 unity = vec2(1.0);
+    vec2 t = unity/(unity + 0.47047*abs(x));
+    //  Cubic in t (Horner form), scaled by the Gaussian falloff:
+    vec2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec2 magnitude = unity - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float erf6(float x)
+{
+    //  Scalar overload: Abramowitz/Stegun erf() approximation.
+    float t = 1.0/(1.0 + 0.47047*abs(x));
+    //  Cubic in t (Horner form), scaled by the Gaussian falloff:
+    float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float magnitude = 1.0 - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+vec4 erft(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf(x) approximated as tanh(1.202760580 * x): fast, but
+    //              with visually noticeable error per the original author:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Relies on a correct driver tanh(); the original author
+    //              reports garbage output from an nVidia 8800GTS.
+    vec4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+vec3 erft(vec3 x)
+{
+    //  vec3 overload: erf(x) approximated per component with tanh().
+    return tanh(x * 1.202760580);
+}
+
+vec2 erft(vec2 x)
+{
+    //  vec2 overload: erf(x) approximated per component with tanh().
+    return tanh(x * 1.202760580);
+}
+
+float erft(float x)
+{
+    //  Scalar overload: erf(x) approximated with tanh().
+    return tanh(x * 1.202760580);
+}
+
+vec4 erf(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf6(x) by default, or erft(x) when ERF_FAST_APPROXIMATION is defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec3 erf(vec3 x)
+{
+    //  vec3 overload: erf6(x) unless ERF_FAST_APPROXIMATION selects erft(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec2 erf(vec2 x)
+{
+    //  vec2 overload: erf6(x) unless ERF_FAST_APPROXIMATION selects erft(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+float erf(float x)
+{
+    //  Scalar overload: erf6(x) unless ERF_FAST_APPROXIMATION selects erft(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+vec4 gamma_impl(vec4 s, vec4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s, precomputed by the caller so that it
+    //                  can be reused elsewhere.
+    //  Returns:    An approximate gamma function (real-numbered factorial)
+    //              value from a two-coefficient Lanczos approximation, with
+    //              coefficients calculated using Paul Godfrey's method:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              The g value below (~1.12906830989) is stated to be optimal
+    //              for s in [0, 36], with a maximum relative error of 0.000463
+    //              over 2**16 equally spaced evals.  Three coeffs would reach
+    //              0.0000346 error without hurting latency, but two allow more
+    //              parallelism with outside instructions.
+    vec4 lanczos_g = vec4(1.12906830989);
+    vec4 coeff0 = vec4(0.8109119309638332633713423362694399653724431);
+    vec4 coeff1 = vec4(0.4808354605142681877121661197951496120000040);
+    vec4 euler = vec4(2.71828182845904523536028747135266249775724709);
+    vec4 s_plus_half = s + vec4(0.5);
+    vec4 base = (s_plus_half + lanczos_g)/euler;  //  or (s + g + vec4(0.5))/e
+    vec4 series = coeff0 + coeff1/(s + vec4(1.0));
+    //  pow(base, s_plus_half) * series approximates gamma(s + 1); multiplying
+    //  by s_inv yields gamma(s), with less error for small s than (s -= 1.0).
+    vec4 gamma_s_plus_one = pow(base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+vec3 gamma_impl(vec3 s, vec3 s_inv)
+{
+    //  vec3 overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+    vec3 lanczos_g = vec3(1.12906830989);
+    vec3 coeff0 = vec3(0.8109119309638332633713423362694399653724431);
+    vec3 coeff1 = vec3(0.4808354605142681877121661197951496120000040);
+    vec3 euler = vec3(2.71828182845904523536028747135266249775724709);
+    vec3 s_plus_half = s + vec3(0.5);
+    vec3 base = (s_plus_half + lanczos_g)/euler;
+    vec3 series = coeff0 + coeff1/(s + vec3(1.0));
+    return (pow(base, s_plus_half) * series) * s_inv;
+}
+
+vec2 gamma_impl(vec2 s, vec2 s_inv)
+{
+    //  vec2 overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+    vec2 lanczos_g = vec2(1.12906830989);
+    vec2 coeff0 = vec2(0.8109119309638332633713423362694399653724431);
+    vec2 coeff1 = vec2(0.4808354605142681877121661197951496120000040);
+    vec2 euler = vec2(2.71828182845904523536028747135266249775724709);
+    vec2 s_plus_half = s + vec2(0.5);
+    vec2 base = (s_plus_half + lanczos_g)/euler;
+    vec2 series = coeff0 + coeff1/(s + vec2(1.0));
+    return (pow(base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(float s, float s_inv)
+{
+    //  Scalar overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+    float lanczos_g = 1.12906830989;
+    float coeff0 = 0.8109119309638332633713423362694399653724431;
+    float coeff1 = 0.4808354605142681877121661197951496120000040;
+    float euler = 2.71828182845904523536028747135266249775724709;
+    float s_plus_half = s + 0.5;
+    float base = (s_plus_half + lanczos_g)/euler;
+    float series = coeff0 + coeff1/(s + 1.0);
+    return (pow(base, s_plus_half) * series) * s_inv;
+}
+
+vec4 gamma(vec4 s)
+{
+    //  Requires:   s is the standard parameter to the gamma function, and it
+    //              should lie in the [0, 36] range.
+    //  Returns:    gamma_impl(s, 1.0/s); see gamma_impl for the approximation.
+    vec4 s_inv = vec4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+vec3 gamma(vec3 s)
+{
+    vec3 s_inv = vec3(1.0)/s;  //  Reciprocal that gamma_impl() expects.
+    return gamma_impl(s, s_inv);
+}
+
+vec2 gamma(vec2 s)
+{
+    vec2 s_inv = vec2(1.0)/s;  //  Reciprocal that gamma_impl() expects.
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(float s)
+{
+    float s_inv = 1.0/s;  //  Reciprocal that gamma_impl() expects.
+    return gamma_impl(s, s_inv);
+}
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+vec4 ligamma_small_z_impl(vec4 s, vec4 z, vec4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    The first four terms (k = 0..3) of a series representation
+    //              of the lower incomplete gamma function for small s and z:
+    //                  z**s * sum((-1)**k * z**k / (k! * (s + k)))
+    //  Everything is evaluated per component.  The terms are written out
+    //  explicitly with their constants folded:
+    //      k = 0:  +1/s                    (s_inv)
+    //      k = 1:  -z/(s + 1)
+    //      k = 2:  +z**2/(2*s + 4)         (2! * (s + 2))
+    //      k = 3:  -z**3/(6*s + 18)        (3! * (s + 3))
+    //  A fifth term, +z**4/(24*s + 96), is not evaluated here,
+    //  so the series is truncated after k = 3.
+    vec4 z_squared = z*z;
+    vec4 z_cubed = z * z_squared;
+    //  Denominators for terms 1, 2, and 3:
+    vec4 term1_denom = s + vec4(1.0);
+    vec4 term2_denom = 2.0*s + vec4(4.0);
+    vec4 term3_denom = 6.0*s + vec4(18.0);
+    //  Accumulate the alternating series, starting from term 0:
+    vec4 series = s_inv;
+    series -= z/term1_denom;
+    series += z_squared/term2_denom;
+    series -= z_cubed/term3_denom;
+    //  Scale by z**s and return:
+    vec4 z_pow_s = pow(z, s);
+    return z_pow_s * series;
+}
+
+vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv)
+{
+    //  vec3 overload: z**s times the 4-term alternating series (k = 0..3).
+    vec3 z_squared = z*z;
+    vec3 z_cubed = z * z_squared;
+    vec3 term1_denom = s + vec3(1.0);
+    vec3 term2_denom = 2.0*s + vec3(4.0);
+    vec3 term3_denom = 6.0*s + vec3(18.0);
+    vec3 series = s_inv;
+    series -= z/term1_denom;
+    series += z_squared/term2_denom;
+    series -= z_cubed/term3_denom;
+    return pow(z, s) * series;
+}
+
+vec2 ligamma_small_z_impl(vec2 s, vec2 z, vec2 s_inv)
+{
+    //  vec2 overload: z**s times the 4-term alternating series (k = 0..3).
+    vec2 z_squared = z*z;
+    vec2 z_cubed = z * z_squared;
+    vec2 term1_denom = s + vec2(1.0);
+    vec2 term2_denom = 2.0*s + vec2(4.0);
+    vec2 term3_denom = 6.0*s + vec2(18.0);
+    vec2 series = s_inv;
+    series -= z/term1_denom;
+    series += z_squared/term2_denom;
+    series -= z_cubed/term3_denom;
+    return pow(z, s) * series;
+}
+
+float ligamma_small_z_impl(float s, float z, float s_inv)
+{
+    //  Scalar overload: z**s times the 4-term alternating series (k = 0..3).
+    float z_squared = z*z;
+    float z_cubed = z * z_squared;
+    float term1_denom = s + 1.0;
+    float term2_denom = 2.0*s + 4.0;
+    float term3_denom = 6.0*s + 18.0;
+    float series = s_inv;
+    series -= z/term1_denom;
+    series += z_squared/term2_denom;
+    series -= z_cubed/term3_denom;
+    return pow(z, s) * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+vec4 uigamma_large_z_impl(vec4 s, vec4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    A 4-level truncation of Gauss's continued fraction
+    //              representation of the upper incomplete gamma function:
+    //                  z**s * e**(-z) / denominator
+    //  Everything is evaluated per component.
+    //  The truncated denominator is built from the innermost level (i = 4)
+    //  outward to i = 1, where level i is:
+    //      (2*i - 1) + z - s + i*(s - i)/(level i + 1)
+    //  and the innermost level drops the fraction term entirely.  The
+    //  i*(s - i) numerators are pre-expanded: 3*s - 9, 2*s - 4, s - 1.
+    //  Innermost level first:
+    vec4 level4 = vec4(7.0) + z - s;
+    vec4 level3 = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/level4;
+    vec4 level2 = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/level3;
+    vec4 level1 = vec4(1.0) + z - s + (s - vec4(1.0))/level2;
+    //  Numerator shared by the whole fraction:
+    vec4 z_pow_s_exp = pow(z, s) * exp(-z);
+    return z_pow_s_exp / level1;
+}
+
+vec3 uigamma_large_z_impl(vec3 s, vec3 z)
+{
+    //  vec3 overload: 4-level continued fraction, innermost level first.
+    vec3 level4 = vec3(7.0) + z - s;
+    vec3 level3 = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/level4;
+    vec3 level2 = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/level3;
+    vec3 level1 = vec3(1.0) + z - s + (s - vec3(1.0))/level2;
+    vec3 z_pow_s_exp = pow(z, s) * exp(-z);
+    return z_pow_s_exp / level1;
+}
+
+vec2 uigamma_large_z_impl(vec2 s, vec2 z)
+{
+    //  vec2 overload: 4-level continued fraction, innermost level first.
+    vec2 level4 = vec2(7.0) + z - s;
+    vec2 level3 = vec2(5.0) + z - s + (3.0*s - vec2(9.0))/level4;
+    vec2 level2 = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/level3;
+    vec2 level1 = vec2(1.0) + z - s + (s - vec2(1.0))/level2;
+    vec2 z_pow_s_exp = pow(z, s) * exp(-z);
+    return z_pow_s_exp / level1;
+}
+
+float uigamma_large_z_impl(float s, float z)
+{
+    //  Scalar overload: 4-level continued fraction, innermost level first.
+    float level4 = 7.0 + z - s;
+    float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
+    float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
+    float level1 = 1.0 + z - s + (s - 1.0)/level2;
+    float z_pow_s_exp = pow(z, s) * exp(-z);
+    return z_pow_s_exp / level1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+vec4 normalized_ligamma_impl(vec4 s, vec4 z,
+    vec4 s_inv, vec4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5, per component.  Two branches are evaluated
+    //              (four terms each, stated max relative error ~0.00182) and
+    //              one is selected by comparing z against a threshold adapted
+    //              from Gil/Segura/Temme's paper:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both branches are always computed and blended with a 0/1 mask instead
+    //  of branching (the original author measured real branches as slower).
+    vec4 thresh = vec4(0.775075);
+    vec4 z_is_large = vec4(greaterThan(z, thresh));  //  1.0 where z > thresh
+    //  Large z: 1 - normalized upper incomplete gamma.  Small z: series.
+    vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  The small-z weight must be the complement of the large-z weight:
+    return large_z * z_is_large + small_z * (vec4(1.0) - z_is_large);
+}
+
+vec3 normalized_ligamma_impl(vec3 s, vec3 z,
+    vec3 s_inv, vec3 gamma_s_inv)
+{
+    //  vec3 overload: blend both branches with a 0/1 mask on z > thresh.
+    vec3 thresh = vec3(0.775075);
+    vec3 z_is_large = vec3(greaterThan(z, thresh));  //  1.0 where z > thresh
+    vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  The small-z weight must be the complement of the large-z weight:
+    return large_z * z_is_large + small_z * (vec3(1.0) - z_is_large);
+}
+
+vec2 normalized_ligamma_impl(vec2 s, vec2 z,
+    vec2 s_inv, vec2 gamma_s_inv)
+{
+    //  vec2 overload: blend both branches with a 0/1 mask on z > thresh.
+    vec2 thresh = vec2(0.775075);
+    vec2 z_is_large = vec2(greaterThan(z, thresh));  //  1.0 where z > thresh
+    vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  The small-z weight must be the complement of the large-z weight:
+    return large_z * z_is_large + small_z * (vec2(1.0) - z_is_large);
+}
+
+float normalized_ligamma_impl(float s, float z,
+    float s_inv, float gamma_s_inv)
+{
+    //  Scalar overload: blend both branches with a 0/1 mask on z > thresh.
+    float thresh = 0.775075;
+    float z_is_large = (z > thresh) ? 1.0 : 0.0;
+    float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  The small-z weight must be the complement of the large-z weight:
+    return large_z * z_is_large + small_z * (1.0 - z_is_large);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+vec4 normalized_ligamma(vec4 s, vec4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    normalized_ligamma_impl() of s and z, after precomputing the two reciprocals it expects.
+    vec4 s_reciprocal = vec4(1.0)/s;
+    vec4 gamma_s = gamma_impl(s, s_reciprocal);
+    vec4 gamma_s_reciprocal = vec4(1.0)/gamma_s;
+    return normalized_ligamma_impl(s, z, s_reciprocal, gamma_s_reciprocal);
+}
+
+vec3 normalized_ligamma(vec3 s, vec3 z)
+{
+    vec3 s_reciprocal = vec3(1.0)/s;  //  vec3 overload; s < ~0.5 is required.
+    vec3 gamma_s = gamma_impl(s, s_reciprocal);
+    vec3 gamma_s_reciprocal = vec3(1.0)/gamma_s;
+    return normalized_ligamma_impl(s, z, s_reciprocal, gamma_s_reciprocal);
+}
+
+vec2 normalized_ligamma(vec2 s, vec2 z)
+{
+    vec2 s_reciprocal = vec2(1.0)/s;  //  vec2 overload; s < ~0.5 is required.
+    vec2 gamma_s = gamma_impl(s, s_reciprocal);
+    vec2 gamma_s_reciprocal = vec2(1.0)/gamma_s;
+    return normalized_ligamma_impl(s, z, s_reciprocal, gamma_s_reciprocal);
+}
+
+float normalized_ligamma(float s, float z)
+{
+    float s_reciprocal = 1.0/s;  //  Scalar overload; s < ~0.5 is required.
+    float gamma_s = gamma_impl(s, s_reciprocal);
+    float gamma_s_reciprocal = 1.0/gamma_s;
+    return normalized_ligamma_impl(s, z, s_reciprocal, gamma_s_reciprocal);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+vec4 uv2_to_uv4(vec2 tex_uv)
+{
+    //  Pad a vec2 uv offset with zeros so it can be added to vec4 tex2Dlod coords:
+    return vec4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Squared-length helper macro (for usage with static constants); it expands to dot(vec, vec), so the argument appears twice:
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+float get_fast_gaussian_weight_sum_inv(float sigma)
+{
+    //  Requires:   sigma is the blur's Gaussian standard deviation.
+    //  Returns:    A fast estimate of 1/(sum of unnormalized Gaussian weights),
+    //              i.e. the normalized weight of the center pixel (whose
+    //              unnormalized weight is 1.0).
+    //  Background (from the original author): for a large enough blur (9+), the
+    //  asymptotic center weight from the Gaussian integral is
+    //      0.5 * (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //  which, since erf(-x) == -erf(x), reduces to erf(0.5/(sigma*sqrt(2.0))).
+    //  The curve fit used below is reported to be both faster and closer: it
+    //  was constructed from 64 blur sizes from [3, 131) and 255 equally-spaced
+    //  sigmas from (0, blurN_std_dev), so results for smaller sigmas are
+    //  biased toward smaller blurs.  Reported max error: 0.0031793913.
+    //  Reported relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    //  erf alternative:  return erf((0.5/sqrt(2.0))/sigma);
+    float curve_fit = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    float asymptote = 0.399334576340352/sigma;
+    return min(curve_fit, asymptote);
+}
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    An 11-tap 1D Gaussian blur of tex centered on tex_uv, with
+    //              taps spaced dxdy apart (original note: may be mipmapped).
+    //  Unnormalized weights for tap distances 0-5 are exp(-d*d/(2*sigma*sigma));
+    //  constant factors are skipped because the result is normalized below.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float g5 = exp(-25.0 * inv_two_var);
+    float norm =
+        1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5));
+    //  Accumulate the weighted taps from one end of the kernel to the other,
+    //  then scale once by the inverse of the total weight.
+    vec3 blurred = vec3(0.0);
+    blurred += g5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    blurred += g4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    blurred += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    blurred += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += g1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += g1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    blurred += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    blurred += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    blurred += g4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    blurred += g5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 9-tap 1D Gaussian blur of tex centered on tex_uv, with
+    //              taps spaced dxdy apart (original note: may be mipmapped).
+    //  Unnormalized weights for tap distances 0-4: exp(-d*d/(2*sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Accumulate the weighted taps end to end, then normalize once:
+    vec3 blurred = vec3(0.0);
+    blurred += g4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    blurred += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    blurred += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += g1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += g1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    blurred += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    blurred += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    blurred += g4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 7-tap 1D Gaussian blur of tex centered on tex_uv, with
+    //              taps spaced dxdy apart (original note: may be mipmapped).
+    //  Unnormalized weights for tap distances 0-3: exp(-d*d/(2*sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Accumulate the weighted taps end to end, then normalize once:
+    vec3 blurred = vec3(0.0);
+    blurred += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    blurred += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += g1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += g1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    blurred += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    blurred += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 5-tap 1D Gaussian blur of tex centered on tex_uv, with
+    //              taps spaced dxdy apart (original note: may be mipmapped).
+    //  Unnormalized weights for tap distances 0-2: exp(-d*d/(2*sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Accumulate the weighted taps end to end, then normalize once:
+    vec3 blurred = vec3(0.0);
+    blurred += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += g1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += g1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    blurred += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3-tap 1D Gaussian blur of tex centered on tex_uv, with
+    //              taps spaced dxdy apart (original note: may be mipmapped).
+    //  Unnormalized weights for tap distances 0-1: exp(-d*d/(2*sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * g1);
+    //  Accumulate the weighted taps end to end, then normalize once:
+    vec3 blurred = vec3(0.0);
+    blurred += g1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += g1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    return blurred * norm;
+}
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    An 11-texel 1D Gaussian blur of tex around tex_uv built
+    //              from 6 linearly filtered taps (original note: may be
+    //              mipmapped depending on settings and dxdy).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float g5 = exp(-25.0 * inv_two_var);
+    float norm =
+        1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5));
+    //  Merge neighboring texels into pairs; each pair is fetched with one tap.
+    //  The center texel is shared by both middle taps, so it counts half each.
+    float g0_1 = g0 * 0.5 + g1;
+    float g2_3 = g2 + g3;
+    float g4_5 = g4 + g5;
+    float g0_1_frac = g1/g0_1;
+    float g2_3_frac = g3/g2_3;
+    float g4_5_frac = g5/g4_5;
+    //  Each tap sits between its two texels at the pair's weight fraction:
+    vec3 blurred = vec3(0.0);
+    blurred += g4_5 * tex2D_linearize(tex, tex_uv - (4.0 + g4_5_frac) * dxdy).rgb;
+    blurred += g2_3 * tex2D_linearize(tex, tex_uv - (2.0 + g2_3_frac) * dxdy).rgb;
+    blurred += g0_1 * tex2D_linearize(tex, tex_uv - g0_1_frac * dxdy).rgb;
+    blurred += g0_1 * tex2D_linearize(tex, tex_uv + g0_1_frac * dxdy).rgb;
+    blurred += g2_3 * tex2D_linearize(tex, tex_uv + (2.0 + g2_3_frac) * dxdy).rgb;
+    blurred += g4_5 * tex2D_linearize(tex, tex_uv + (4.0 + g4_5_frac) * dxdy).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 17-texel 1D Gaussian blur of tex around tex_uv built from
+    //              1 center tap and 8 linearly filtered pair taps (original
+    //              note: may be mipmapped depending on settings and dxdy).
+    //  Unnormalized weights for texel distances 0-8: exp(-d*d/(2*sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float g5 = exp(-25.0 * inv_two_var);
+    float g6 = exp(-36.0 * inv_two_var);
+    float g7 = exp(-49.0 * inv_two_var);
+    float g8 = exp(-64.0 * inv_two_var);
+    //  The exact normalization would be
+    //      1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5 + g6 + g7 + g8));
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Merge neighboring texels into pairs; each pair is fetched with one tap.
+    float g1_2 = g1 + g2;
+    float g3_4 = g3 + g4;
+    float g5_6 = g5 + g6;
+    float g7_8 = g7 + g8;
+    float g1_2_frac = g2/g1_2;
+    float g3_4_frac = g4/g3_4;
+    float g5_6_frac = g6/g5_6;
+    float g7_8_frac = g8/g7_8;
+    //  Each pair tap sits between its two texels at the pair's weight fraction:
+    vec3 blurred = vec3(0.0);
+    blurred += g7_8 * tex2D_linearize(tex, tex_uv - (7.0 + g7_8_frac) * dxdy).rgb;
+    blurred += g5_6 * tex2D_linearize(tex, tex_uv - (5.0 + g5_6_frac) * dxdy).rgb;
+    blurred += g3_4 * tex2D_linearize(tex, tex_uv - (3.0 + g3_4_frac) * dxdy).rgb;
+    blurred += g1_2 * tex2D_linearize(tex, tex_uv - (1.0 + g1_2_frac) * dxdy).rgb;
+    blurred += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += g1_2 * tex2D_linearize(tex, tex_uv + (1.0 + g1_2_frac) * dxdy).rgb;
+    blurred += g3_4 * tex2D_linearize(tex, tex_uv + (3.0 + g3_4_frac) * dxdy).rgb;
+    blurred += g5_6 * tex2D_linearize(tex, tex_uv + (5.0 + g5_6_frac) * dxdy).rgb;
+    blurred += g7_8 * tex2D_linearize(tex, tex_uv + (7.0 + g7_8_frac) * dxdy).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 25-texel 1D Gaussian blur of tex around tex_uv built from
+    //              1 center tap and 12 linearly filtered pair taps (original
+    //              note: may be mipmapped depending on settings and dxdy).
+    //  Unnormalized weights for texel distances 0-12: exp(-d*d/(2*sigma*sigma)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float g5 = exp(-25.0 * inv_two_var);
+    float g6 = exp(-36.0 * inv_two_var);
+    float g7 = exp(-49.0 * inv_two_var);
+    float g8 = exp(-64.0 * inv_two_var);
+    float g9 = exp(-81.0 * inv_two_var);
+    float g10 = exp(-100.0 * inv_two_var);
+    float g11 = exp(-121.0 * inv_two_var);
+    float g12 = exp(-144.0 * inv_two_var);
+    //  The exact normalization would be 1.0 / (g0 + 2.0 * (
+    //      g1 + g2 + g3 + g4 + g5 + g6 + g7 + g8 + g9 + g10 + g11 + g12));
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Merge neighboring texels into pairs; each pair is fetched with one tap.
+    float g1_2 = g1 + g2;
+    float g3_4 = g3 + g4;
+    float g5_6 = g5 + g6;
+    float g7_8 = g7 + g8;
+    float g9_10 = g9 + g10;
+    float g11_12 = g11 + g12;
+    float g1_2_frac = g2/g1_2;
+    float g3_4_frac = g4/g3_4;
+    float g5_6_frac = g6/g5_6;
+    float g7_8_frac = g8/g7_8;
+    float g9_10_frac = g10/g9_10;
+    float g11_12_frac = g12/g11_12;
+    //  Each pair tap sits between its two texels at the pair's weight fraction:
+    vec3 blurred = vec3(0.0);
+    blurred += g11_12 * tex2D_linearize(tex, tex_uv - (11.0 + g11_12_frac) * dxdy).rgb;
+    blurred += g9_10 * tex2D_linearize(tex, tex_uv - (9.0 + g9_10_frac) * dxdy).rgb;
+    blurred += g7_8 * tex2D_linearize(tex, tex_uv - (7.0 + g7_8_frac) * dxdy).rgb;
+    blurred += g5_6 * tex2D_linearize(tex, tex_uv - (5.0 + g5_6_frac) * dxdy).rgb;
+    blurred += g3_4 * tex2D_linearize(tex, tex_uv - (3.0 + g3_4_frac) * dxdy).rgb;
+    blurred += g1_2 * tex2D_linearize(tex, tex_uv - (1.0 + g1_2_frac) * dxdy).rgb;
+    blurred += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += g1_2 * tex2D_linearize(tex, tex_uv + (1.0 + g1_2_frac) * dxdy).rgb;
+    blurred += g3_4 * tex2D_linearize(tex, tex_uv + (3.0 + g3_4_frac) * dxdy).rgb;
+    blurred += g5_6 * tex2D_linearize(tex, tex_uv + (5.0 + g5_6_frac) * dxdy).rgb;
+    blurred += g7_8 * tex2D_linearize(tex, tex_uv + (7.0 + g7_8_frac) * dxdy).rgb;
+    blurred += g9_10 * tex2D_linearize(tex, tex_uv + (9.0 + g9_10_frac) * dxdy).rgb;
+    blurred += g11_12 * tex2D_linearize(tex, tex_uv + (11.0 + g11_12_frac) * dxdy).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 31x Gaussian blur from 16 linear taps along dxdy.  It
+    //              may be mipmapped depending on settings and dxdy.
+    //  Texel weight k steps from center (unnormalized): exp(-k*k/(2*sigma^2)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float g5 = exp(-25.0 * inv_two_var);
+    float g6 = exp(-36.0 * inv_two_var);
+    float g7 = exp(-49.0 * inv_two_var);
+    float g8 = exp(-64.0 * inv_two_var);
+    float g9 = exp(-81.0 * inv_two_var);
+    float g10 = exp(-100.0 * inv_two_var);
+    float g11 = exp(-121.0 * inv_two_var);
+    float g12 = exp(-144.0 * inv_two_var);
+    float g13 = exp(-169.0 * inv_two_var);
+    float g14 = exp(-196.0 * inv_two_var);
+    float g15 = exp(-225.0 * inv_two_var);
+    //  The normalization factor comes from the shared helper.  The explicit
+    //  form it stands in for is:
+    //      1.0 / (g0 + 2.0 * (g1 + g2 + ... + g15))
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Fold each texel pair into one tap weight.  Both inner taps overlap the
+    //  center texel, so each one takes half of g0.
+    float pair0_1 = g0 * 0.5 + g1;
+    float pair2_3 = g2 + g3;
+    float pair4_5 = g4 + g5;
+    float pair6_7 = g6 + g7;
+    float pair8_9 = g8 + g9;
+    float pair10_11 = g10 + g11;
+    float pair12_13 = g12 + g13;
+    float pair14_15 = g14 + g15;
+    vec2 shift0_1 = (g1/pair0_1) * dxdy;
+    vec2 shift2_3 = (2.0 + g3/pair2_3) * dxdy;
+    vec2 shift4_5 = (4.0 + g5/pair4_5) * dxdy;
+    vec2 shift6_7 = (6.0 + g7/pair6_7) * dxdy;
+    vec2 shift8_9 = (8.0 + g9/pair8_9) * dxdy;
+    vec2 shift10_11 = (10.0 + g11/pair10_11) * dxdy;
+    vec2 shift12_13 = (12.0 + g13/pair12_13) * dxdy;
+    vec2 shift14_15 = (14.0 + g15/pair14_15) * dxdy;
+    //  Accumulate the weighted taps from one end to the other, then normalize:
+    vec3 acc = vec3(0.0);
+    acc += pair14_15 * tex2D_linearize(tex, tex_uv - shift14_15).rgb;
+    acc += pair12_13 * tex2D_linearize(tex, tex_uv - shift12_13).rgb;
+    acc += pair10_11 * tex2D_linearize(tex, tex_uv - shift10_11).rgb;
+    acc += pair8_9 * tex2D_linearize(tex, tex_uv - shift8_9).rgb;
+    acc += pair6_7 * tex2D_linearize(tex, tex_uv - shift6_7).rgb;
+    acc += pair4_5 * tex2D_linearize(tex, tex_uv - shift4_5).rgb;
+    acc += pair2_3 * tex2D_linearize(tex, tex_uv - shift2_3).rgb;
+    acc += pair0_1 * tex2D_linearize(tex, tex_uv - shift0_1).rgb;
+    acc += pair0_1 * tex2D_linearize(tex, tex_uv + shift0_1).rgb;
+    acc += pair2_3 * tex2D_linearize(tex, tex_uv + shift2_3).rgb;
+    acc += pair4_5 * tex2D_linearize(tex, tex_uv + shift4_5).rgb;
+    acc += pair6_7 * tex2D_linearize(tex, tex_uv + shift6_7).rgb;
+    acc += pair8_9 * tex2D_linearize(tex, tex_uv + shift8_9).rgb;
+    acc += pair10_11 * tex2D_linearize(tex, tex_uv + shift10_11).rgb;
+    acc += pair12_13 * tex2D_linearize(tex, tex_uv + shift12_13).rgb;
+    acc += pair14_15 * tex2D_linearize(tex, tex_uv + shift14_15).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 43x Gaussian blur from 22 linear taps along dxdy.  It
+    //              may be mipmapped depending on settings and dxdy.
+    //  Texel weight k steps from center (unnormalized): exp(-k*k/(2*sigma^2)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float g5 = exp(-25.0 * inv_two_var);
+    float g6 = exp(-36.0 * inv_two_var);
+    float g7 = exp(-49.0 * inv_two_var);
+    float g8 = exp(-64.0 * inv_two_var);
+    float g9 = exp(-81.0 * inv_two_var);
+    float g10 = exp(-100.0 * inv_two_var);
+    float g11 = exp(-121.0 * inv_two_var);
+    float g12 = exp(-144.0 * inv_two_var);
+    float g13 = exp(-169.0 * inv_two_var);
+    float g14 = exp(-196.0 * inv_two_var);
+    float g15 = exp(-225.0 * inv_two_var);
+    float g16 = exp(-256.0 * inv_two_var);
+    float g17 = exp(-289.0 * inv_two_var);
+    float g18 = exp(-324.0 * inv_two_var);
+    float g19 = exp(-361.0 * inv_two_var);
+    float g20 = exp(-400.0 * inv_two_var);
+    float g21 = exp(-441.0 * inv_two_var);
+    //  The normalization factor comes from the shared helper.  The explicit
+    //  form it stands in for is:
+    //      1.0 / (g0 + 2.0 * (g1 + g2 + ... + g21))
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Fold each texel pair into one tap weight.  Both inner taps overlap the
+    //  center texel, so each one takes half of g0.
+    float pair0_1 = g0 * 0.5 + g1;
+    float pair2_3 = g2 + g3;
+    float pair4_5 = g4 + g5;
+    float pair6_7 = g6 + g7;
+    float pair8_9 = g8 + g9;
+    float pair10_11 = g10 + g11;
+    float pair12_13 = g12 + g13;
+    float pair14_15 = g14 + g15;
+    float pair16_17 = g16 + g17;
+    float pair18_19 = g18 + g19;
+    float pair20_21 = g20 + g21;
+    vec2 shift0_1 = (g1/pair0_1) * dxdy;
+    vec2 shift2_3 = (2.0 + g3/pair2_3) * dxdy;
+    vec2 shift4_5 = (4.0 + g5/pair4_5) * dxdy;
+    vec2 shift6_7 = (6.0 + g7/pair6_7) * dxdy;
+    vec2 shift8_9 = (8.0 + g9/pair8_9) * dxdy;
+    vec2 shift10_11 = (10.0 + g11/pair10_11) * dxdy;
+    vec2 shift12_13 = (12.0 + g13/pair12_13) * dxdy;
+    vec2 shift14_15 = (14.0 + g15/pair14_15) * dxdy;
+    vec2 shift16_17 = (16.0 + g17/pair16_17) * dxdy;
+    vec2 shift18_19 = (18.0 + g19/pair18_19) * dxdy;
+    vec2 shift20_21 = (20.0 + g21/pair20_21) * dxdy;
+    //  Accumulate the weighted taps from one end to the other, then normalize:
+    vec3 acc = vec3(0.0);
+    acc += pair20_21 * tex2D_linearize(tex, tex_uv - shift20_21).rgb;
+    acc += pair18_19 * tex2D_linearize(tex, tex_uv - shift18_19).rgb;
+    acc += pair16_17 * tex2D_linearize(tex, tex_uv - shift16_17).rgb;
+    acc += pair14_15 * tex2D_linearize(tex, tex_uv - shift14_15).rgb;
+    acc += pair12_13 * tex2D_linearize(tex, tex_uv - shift12_13).rgb;
+    acc += pair10_11 * tex2D_linearize(tex, tex_uv - shift10_11).rgb;
+    acc += pair8_9 * tex2D_linearize(tex, tex_uv - shift8_9).rgb;
+    acc += pair6_7 * tex2D_linearize(tex, tex_uv - shift6_7).rgb;
+    acc += pair4_5 * tex2D_linearize(tex, tex_uv - shift4_5).rgb;
+    acc += pair2_3 * tex2D_linearize(tex, tex_uv - shift2_3).rgb;
+    acc += pair0_1 * tex2D_linearize(tex, tex_uv - shift0_1).rgb;
+    acc += pair0_1 * tex2D_linearize(tex, tex_uv + shift0_1).rgb;
+    acc += pair2_3 * tex2D_linearize(tex, tex_uv + shift2_3).rgb;
+    acc += pair4_5 * tex2D_linearize(tex, tex_uv + shift4_5).rgb;
+    acc += pair6_7 * tex2D_linearize(tex, tex_uv + shift6_7).rgb;
+    acc += pair8_9 * tex2D_linearize(tex, tex_uv + shift8_9).rgb;
+    acc += pair10_11 * tex2D_linearize(tex, tex_uv + shift10_11).rgb;
+    acc += pair12_13 * tex2D_linearize(tex, tex_uv + shift12_13).rgb;
+    acc += pair14_15 * tex2D_linearize(tex, tex_uv + shift14_15).rgb;
+    acc += pair16_17 * tex2D_linearize(tex, tex_uv + shift16_17).rgb;
+    acc += pair18_19 * tex2D_linearize(tex, tex_uv + shift18_19).rgb;
+    acc += pair20_21 * tex2D_linearize(tex, tex_uv + shift20_21).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Gaussian weights of the center texel (w0) and its neighbors (w1):
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    //  Each tap blends the center texel with one neighbor.  The center texel
+    //  is read by both taps, so it contributes half of w0 to each of them.
+    float w01 = w0 * 0.5 + w1;
+    float w01_ratio = w1/w01;
+    //  Both taps weigh w01 and the kernel sums to w0 + 2.0 * w1 = 2.0 * w01,
+    //  so normalizing reduces to a plain average (no weight-sum inverse needed):
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 5x Gaussian blur from 1 nearest neighbor tap and 2
+    //              linear taps along dxdy.  It may be mipmapped depending on
+    //              settings and dxdy.
+    //  Texel weight k steps from center (unnormalized): exp(-k*k/(2*sigma^2)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Texels 1 and 2 on each side share one linear tap placed between them:
+    float pair1_2 = g1 + g2;
+    vec2 shift1_2 = (1.0 + g2/pair1_2) * dxdy;
+    //  Accumulate the weighted taps from one end to the other, then normalize:
+    vec3 acc = vec3(0.0);
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv - shift1_2).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv + shift1_2).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 7x Gaussian blur from 4 linear taps along dxdy.  It
+    //              may be mipmapped depending on settings and dxdy.
+    //  Texel weight k steps from center (unnormalized): exp(-k*k/(2*sigma^2)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Fold each texel pair into one tap weight.  Both inner taps overlap the
+    //  center texel, so each one takes half of g0.
+    float pair0_1 = g0 * 0.5 + g1;
+    float pair2_3 = g2 + g3;
+    vec2 shift0_1 = (g1/pair0_1) * dxdy;
+    vec2 shift2_3 = (2.0 + g3/pair2_3) * dxdy;
+    //  Accumulate the weighted taps from one end to the other, then normalize:
+    vec3 acc = vec3(0.0);
+    acc += pair2_3 * tex2D_linearize(tex, tex_uv - shift2_3).rgb;
+    acc += pair0_1 * tex2D_linearize(tex, tex_uv - shift0_1).rgb;
+    acc += pair0_1 * tex2D_linearize(tex, tex_uv + shift0_1).rgb;
+    acc += pair2_3 * tex2D_linearize(tex, tex_uv + shift2_3).rgb;
+    return acc * norm;
+}
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input.
+    //  Description:
+    //  The lone arbitrarily resizable one-pass blur: a tex2Dblur5x5resize would
+    //  perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize.
+    float inv_two_var = 0.5/(sigma*sigma);
+    //  All 3x3 taps are fetched individually.  Quad-pixel communication
+    //  won't help either: this should perform like tex2Dblur5x5, while sharing
+    //  a 4x4 sample field would perform more like tex2Dblur8x8shared (worse).
+    vec2 step_x = vec2(dxdy.x, 0.0);
+    vec2 step_y = vec2(0.0, dxdy.y);
+    vec2 row_minus = tex_uv - step_y;
+    vec2 row_plus = tex_uv + step_y;
+    //  Taps are named tap_<x><y>, where m/c/p = minus/center/plus one step:
+    vec3 tap_mm = tex2D_linearize(tex, row_minus - step_x).rgb;
+    vec3 tap_cm = tex2D_linearize(tex, row_minus).rgb;
+    vec3 tap_pm = tex2D_linearize(tex, row_minus + step_x).rgb;
+    vec3 tap_mc = tex2D_linearize(tex, tex_uv - step_x).rgb;
+    vec3 tap_cc = tex2D_linearize(tex, tex_uv).rgb;
+    vec3 tap_pc = tex2D_linearize(tex, tex_uv + step_x).rgb;
+    vec3 tap_mp = tex2D_linearize(tex, row_plus - step_x).rgb;
+    vec3 tap_cp = tex2D_linearize(tex, row_plus).rgb;
+    vec3 tap_pp = tex2D_linearize(tex, row_plus + step_x).rgb;
+    //  Gaussian weights for the center, the 4 edge taps, and the 4 corner taps:
+    float w_center = 1.0;
+    float w_edge = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * inv_two_var);
+    float w_corner = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * inv_two_var);
+    float norm = 1.0/(w_center + 4.0 * (w_edge + w_corner));
+    //  Weight and sum the taps, then normalize:
+    vec3 acc = w_center * tap_cc +
+        w_edge * (tap_cm + tap_mc + tap_pc + tap_cp) +
+        w_corner * (tap_mm + tap_pm + tap_mp + tap_pp);
+    return acc * norm;
+}
+
+//  Resizable one-pass blurs:
+vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    //  Forwards to the 4-argument overload with sigma = blur3_std_dev.
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 9x Gaussian blur from 1 nearest neighbor tap and 4
+    //              linear taps along dxdy.  It may be mipmapped depending on
+    //              settings and dxdy.
+    //  Texel weight k steps from center (unnormalized): exp(-k*k/(2*sigma^2)).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float g0 = 1.0;
+    float g1 = exp(-1.0 * inv_two_var);
+    float g2 = exp(-4.0 * inv_two_var);
+    float g3 = exp(-9.0 * inv_two_var);
+    float g4 = exp(-16.0 * inv_two_var);
+    float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Texels [1, 2] and [3, 4] on each side each share one linear tap:
+    float pair1_2 = g1 + g2;
+    float pair3_4 = g3 + g4;
+    vec2 shift1_2 = (1.0 + g2/pair1_2) * dxdy;
+    vec2 shift3_4 = (3.0 + g4/pair3_4) * dxdy;
+    //  Accumulate the weighted taps from one end to the other, then normalize:
+    vec3 acc = vec3(0.0);
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv - shift3_4).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv - shift1_2).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv + shift1_2).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv + shift3_4).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  A 9x9 texel footprint is covered by only 5x5 filtered samples.  Each
+    //  sample location is adjusted so that the filtered blend reflects the
+    //  true Gaussian weights of the texels underneath it.  In the sample
+    //  layout below, samples with the same number have the same weight (notice
+    //  the symmetry).  The letters a, b, c, d distinguish quadrants, and U, D,
+    //  L, R, C (up, down, left, right, center) distinguish the samples on the
+    //  lines through the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The underlying equally spaced texels are named after the sample that
+    //  reads them, plus an index for their location within that sample's 2x2,
+    //  2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  There is only one C texel and only two texels per U, D, L, or R
+    //  sample: the center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  linK is the 1D Gaussian weight of the texel K texels away from the
+    //  pixel center.  The weight ratios within texel pairs [1, 2] and [3, 4]
+    //  place each sample inside its texel block; they are computed once in 1D
+    //  and reused independently for both dimensions.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float lin1 = exp(-1.0 * inv_two_var);
+    float lin2 = exp(-4.0 * inv_two_var);
+    float lin3 = exp(-9.0 * inv_two_var);
+    float lin4 = exp(-16.0 * inv_two_var);
+    //  Distance (in texels) from the center to the inner and outer samples:
+    float inner_off = 1.0 + lin2/(lin1 + lin2);
+    float outer_off = 3.0 + lin4/(lin3 + lin4);
+    //  Axis-aligned sample steps, scaled by dxdy:
+    vec2 step1_x = dxdy * vec2(inner_off, 0.0);
+    vec2 step1_y = dxdy * vec2(0.0, inner_off);
+    vec2 step2_x = dxdy * vec2(outer_off, 0.0);
+    vec2 step2_y = dxdy * vec2(0.0, outer_off);
+    //  Texel offsets of the bilinear samples in the bottom-right (d) quadrant:
+    vec2 diag3 = vec2(inner_off, inner_off);
+    vec2 diag4 = vec2(outer_off, inner_off);
+    vec2 diag5 = vec2(inner_off, outer_off);
+    vec2 diag6 = vec2(outer_off, outer_off);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  gXY is the Gaussian weight of the texel (X, Y) texels from the center.
+    //  Only the bottom-right quadrant is computed; symmetry covers the rest.
+    //  Texels per sample:  3d: (1..2, 1..2)   4d: (3..4, 1..2)
+    //                      5d: (1..2, 3..4)   6d: (3..4, 3..4)
+    float g11 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * inv_two_var);
+    float g21 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * inv_two_var);
+    float g22 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * inv_two_var);
+    float g31 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * inv_two_var);
+    float g41 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * inv_two_var);
+    float g32 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * inv_two_var);
+    float g42 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * inv_two_var);
+    float g33 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * inv_two_var);
+    float g43 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * inv_two_var);
+    float g44 = exp(-LENGTH_SQ(vec2(4.0, 4.0)) * inv_two_var);
+    //  Each sample weight is the sum of the texel weights it covers.  Samples
+    //  4 and 5 are transposes of each other, so they share one weight:
+    float sw0 = 1.0;
+    float sw1 = lin1 + lin2;
+    float sw2 = lin3 + lin4;
+    float sw3 = g11 + 2.0 * g21 + g22;
+    float sw4 = g31 + g41 + g32 + g42;
+    float sw5 = sw4;
+    float sw6 = g33 + 2.0 * g43 + g44;
+    //  Normalization factor (inverse of the total kernel weight):
+    float norm = 1.0/(sw0 + 4.0 * (sw1 + sw2 + sw3 + sw4 + sw5 + sw6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry.
+    //  Mirrored copies of dxdy steer the d-quadrant offsets into c, b, and a:
+    vec2 dxdy_c = vec2(-dxdy.x, dxdy.y);
+    vec2 dxdy_b = vec2(dxdy.x, -dxdy.y);
+    vec2 dxdy_a = vec2(-dxdy.x, -dxdy.y);
+    //  Sampling order doesn't seem to affect performance, so just be clear.
+    //  Center sample and the axis-aligned linear samples:
+    vec3 tap0C = tex2D_linearize(tex, tex_uv).rgb;
+    vec3 tap1R = tex2D_linearize(tex, tex_uv + step1_x).rgb;
+    vec3 tap1D = tex2D_linearize(tex, tex_uv + step1_y).rgb;
+    vec3 tap1L = tex2D_linearize(tex, tex_uv - step1_x).rgb;
+    vec3 tap1U = tex2D_linearize(tex, tex_uv - step1_y).rgb;
+    vec3 tap2R = tex2D_linearize(tex, tex_uv + step2_x).rgb;
+    vec3 tap2D = tex2D_linearize(tex, tex_uv + step2_y).rgb;
+    vec3 tap2L = tex2D_linearize(tex, tex_uv - step2_x).rgb;
+    vec3 tap2U = tex2D_linearize(tex, tex_uv - step2_y).rgb;
+    //  Bilinear samples, four mirrored copies per quadrant offset:
+    vec3 tap3d = tex2D_linearize(tex, tex_uv + dxdy * diag3).rgb;
+    vec3 tap3c = tex2D_linearize(tex, tex_uv + dxdy_c * diag3).rgb;
+    vec3 tap3b = tex2D_linearize(tex, tex_uv + dxdy_b * diag3).rgb;
+    vec3 tap3a = tex2D_linearize(tex, tex_uv + dxdy_a * diag3).rgb;
+    vec3 tap4d = tex2D_linearize(tex, tex_uv + dxdy * diag4).rgb;
+    vec3 tap4c = tex2D_linearize(tex, tex_uv + dxdy_c * diag4).rgb;
+    vec3 tap4b = tex2D_linearize(tex, tex_uv + dxdy_b * diag4).rgb;
+    vec3 tap4a = tex2D_linearize(tex, tex_uv + dxdy_a * diag4).rgb;
+    vec3 tap5d = tex2D_linearize(tex, tex_uv + dxdy * diag5).rgb;
+    vec3 tap5c = tex2D_linearize(tex, tex_uv + dxdy_c * diag5).rgb;
+    vec3 tap5b = tex2D_linearize(tex, tex_uv + dxdy_b * diag5).rgb;
+    vec3 tap5a = tex2D_linearize(tex, tex_uv + dxdy_a * diag5).rgb;
+    vec3 tap6d = tex2D_linearize(tex, tex_uv + dxdy * diag6).rgb;
+    vec3 tap6c = tex2D_linearize(tex, tex_uv + dxdy_c * diag6).rgb;
+    vec3 tap6b = tex2D_linearize(tex, tex_uv + dxdy_b * diag6).rgb;
+    vec3 tap6a = tex2D_linearize(tex, tex_uv + dxdy_a * diag6).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Accumulate the weighted samples in a fixed order, then apply the
+    //  normalization factor so the kernel total is 1.0:
+    vec3 acc = sw0 * tap0C;
+    acc += sw1 * (tap1R + tap1D + tap1L + tap1U);
+    acc += sw2 * (tap2R + tap2D + tap2L + tap2U);
+    acc += sw3 * (tap3d + tap3c + tap3b + tap3a);
+    acc += sw4 * (tap4d + tap4c + tap4b + tap4a);
+    acc += sw5 * (tap5d + tap5c + tap5b + tap5a);
+    acc += sw6 * (tap6d + tap6c + tap6b + tap6a);
+    return acc * norm;
+}
+
+vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  This blur mixes concepts from tex2Dblur9x9() and tex2Dblur7(); see
+    //  their descriptions first.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Samples 3a/3b, 1a/1b, 1c/1d, and
+    //  3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  linK is the 1D Gaussian weight of the texel K texels from the center.
+    //  Texels [0, 1] and [2, 3] are paired; the center texel is shared by the
+    //  samples on both sides, so it enters at half weight (see tex2Dblur9x9).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float lin0 = 1.0;
+    float lin1 = exp(-1.0 * inv_two_var);
+    float lin2 = exp(-4.0 * inv_two_var);
+    float lin3 = exp(-9.0 * inv_two_var);
+    //  Distance (in texels) from the center to the inner and outer samples:
+    float inner_off = lin1/(lin0 * 0.5 + lin1);
+    float outer_off = 2.0 + lin3/(lin2 + lin3);
+    //  Texel offsets of the four bilinear samples in the bottom-right quadrant:
+    vec2 diag1 = vec2(inner_off, inner_off);
+    vec2 diag2 = vec2(outer_off, inner_off);
+    vec2 diag3 = vec2(inner_off, outer_off);
+    vec2 diag4 = vec2(outer_off, outer_off);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  gXY: Gaussian weight of the texel (X, Y) texels from center (d quadrant).
+    float g00 = 1.0;
+    float g10 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * inv_two_var);
+    float g20 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * inv_two_var);
+    float g30 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * inv_two_var);
+    float g11 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * inv_two_var);
+    float g21 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * inv_two_var);
+    float g31 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * inv_two_var);
+    float g22 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * inv_two_var);
+    float g32 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * inv_two_var);
+    float g33 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * inv_two_var);
+    //  Each sample weight sums the texel weights it covers.  A texel shared
+    //  between samples is split evenly: the center texel four ways, all other
+    //  texels on the center row or column two ways.  Sample 1d holds half of
+    //  texel (1, 0) and half of texel (0, 1), which together weigh g10.
+    float sw1 = g00 * 0.25 + g10 + g11;
+    float sw2_3 = (g20 + g30) * 0.5 + g21 + g31;
+    float sw4 = g22 + 2.0 * g32 + g33;
+    //  Normalization factor (inverse of the total kernel weight):
+    float norm = 1.0/(4.0 * (sw1 + 2.0 * sw2_3 + sw4));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples using symmetry.  Mirrored copies of dxdy steer the
+    //  d-quadrant offsets into the c, b, and a quadrants:
+    vec2 dxdy_c = vec2(-dxdy.x, dxdy.y);
+    vec2 dxdy_b = vec2(dxdy.x, -dxdy.y);
+    vec2 dxdy_a = vec2(-dxdy.x, -dxdy.y);
+    vec3 tap1a = tex2D_linearize(tex, tex_uv + dxdy_a * diag1).rgb;
+    vec3 tap2a = tex2D_linearize(tex, tex_uv + dxdy_a * diag2).rgb;
+    vec3 tap3a = tex2D_linearize(tex, tex_uv + dxdy_a * diag3).rgb;
+    vec3 tap4a = tex2D_linearize(tex, tex_uv + dxdy_a * diag4).rgb;
+    vec3 tap1b = tex2D_linearize(tex, tex_uv + dxdy_b * diag1).rgb;
+    vec3 tap2b = tex2D_linearize(tex, tex_uv + dxdy_b * diag2).rgb;
+    vec3 tap3b = tex2D_linearize(tex, tex_uv + dxdy_b * diag3).rgb;
+    vec3 tap4b = tex2D_linearize(tex, tex_uv + dxdy_b * diag4).rgb;
+    vec3 tap1c = tex2D_linearize(tex, tex_uv + dxdy_c * diag1).rgb;
+    vec3 tap2c = tex2D_linearize(tex, tex_uv + dxdy_c * diag2).rgb;
+    vec3 tap3c = tex2D_linearize(tex, tex_uv + dxdy_c * diag3).rgb;
+    vec3 tap4c = tex2D_linearize(tex, tex_uv + dxdy_c * diag4).rgb;
+    vec3 tap1d = tex2D_linearize(tex, tex_uv + dxdy * diag1).rgb;
+    vec3 tap2d = tex2D_linearize(tex, tex_uv + dxdy * diag2).rgb;
+    vec3 tap3d = tex2D_linearize(tex, tex_uv + dxdy * diag3).rgb;
+    vec3 tap4d = tex2D_linearize(tex, tex_uv + dxdy * diag4).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Accumulate the weighted samples, then normalize so the total is 1.0:
+    vec3 acc = vec3(0.0);
+    acc += sw1 * (tap1a + tap1b + tap1c + tap1d);
+    acc += sw2_3 * (tap2a + tap2b + tap2c + tap2d);
+    acc += sw2_3 * (tap3a + tap3b + tap3c + tap3d);
+    acc += sw4 * (tap4a + tap4b + tap4c + tap4d);
+    return acc * norm;
+}
+
+vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  A smaller-scale version of tex2Dblur9x9(): same concept and the same
+    //  sample/texel naming (see the description there).  Samples:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  lin1 and lin2 are the 1D Gaussian weights of the texels 1 and 2 texels
+    //  from the center; their ratio places each sample between its two texels
+    //  (details in tex2Dblur9x9).
+    float inv_two_var = 0.5/(sigma*sigma);
+    float lin1 = exp(-1.0 * inv_two_var);
+    float lin2 = exp(-4.0 * inv_two_var);
+    float reach = 1.0 + lin2/(lin1 + lin2);
+    //  Axis-aligned sample steps (scaled by dxdy) and the texel offset of the
+    //  bilinear sample in the bottom-right quadrant:
+    vec2 step_x = dxdy * vec2(reach, 0.0);
+    vec2 step_y = dxdy * vec2(0.0, reach);
+    vec2 diag = vec2(reach, reach);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  gXY is the Gaussian weight of the texel (X, Y) texels from the center.
+    //  Only the bottom-right quadrant is computed; symmetry covers the rest.
+    float g11 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * inv_two_var);
+    float g21 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * inv_two_var);
+    float g22 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * inv_two_var);
+    //  Each sample weight is the sum of the texel weights it covers (a
+    //  bilinear sample covers both the (2, 1) and (1, 2) texels: 2.0 * g21):
+    float sw0 = 1.0;
+    float sw1 = lin1 + lin2;
+    float sw2 = g11 + 2.0 * g21 + g22;
+    //  Normalization factor (inverse of the total kernel weight):
+    float norm = 1.0/(sw0 + 4.0 * (sw1 + sw2));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry.
+    //  Mirrored copies of dxdy steer the d-quadrant offset into c, b, and a:
+    vec2 dxdy_c = vec2(-dxdy.x, dxdy.y);
+    vec2 dxdy_b = vec2(dxdy.x, -dxdy.y);
+    vec2 dxdy_a = vec2(-dxdy.x, -dxdy.y);
+    vec3 tap0C = tex2D_linearize(tex, tex_uv).rgb;
+    vec3 tap1R = tex2D_linearize(tex, tex_uv + step_x).rgb;
+    vec3 tap1D = tex2D_linearize(tex, tex_uv + step_y).rgb;
+    vec3 tap1L = tex2D_linearize(tex, tex_uv - step_x).rgb;
+    vec3 tap1U = tex2D_linearize(tex, tex_uv - step_y).rgb;
+    vec3 tap2d = tex2D_linearize(tex, tex_uv + dxdy * diag).rgb;
+    vec3 tap2c = tex2D_linearize(tex, tex_uv + dxdy_c * diag).rgb;
+    vec3 tap2b = tex2D_linearize(tex, tex_uv + dxdy_b * diag).rgb;
+    vec3 tap2a = tex2D_linearize(tex, tex_uv + dxdy_a * diag).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Accumulate the weighted samples, then normalize so the total is 1.0:
+    vec3 acc = sw0 * tap0C;
+    acc += sw1 * (tap1R + tap1D + tap1L + tap1U);
+    acc += sw2 * (tap2a + tap2b + tap2c + tap2d);
+    return acc * norm;
+}
+
+vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  One-pass 3x3 Gaussian blur built from 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    The unweighted average of four tex2D_linearize() taps
+    //              placed diagonally around tex_uv (rgb only).
+    //  Description:
+    //  See the descriptions for tex2Dblur9x9() and tex2Dblur7() first; this
+    //  blur mixes concepts from both.  Sample layout:
+    //      0a 0b
+    //      0c 0d
+    //  Texel layout: samples 0a/0b and 0c/0d share a vertical column of
+    //  texels, samples 0a/0c and 0b/0d share a horizontal row of texels,
+    //  and all four samples share the center texel:
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Gaussian weights of the center texel and of a texel one step away
+    //  along an axis (bilinear offset details are in tex2Dblur9x9):
+    float inv_two_variance = 0.5/(sigma*sigma);
+    float center_weight = 1.0;
+    float neighbor_weight = exp(-1.0 * inv_two_variance);
+    //  Only half of the center weight enters the ratio; presumably because
+    //  the center texel is shared by the samples on either side of it:
+    float offset_ratio =
+        neighbor_weight/(center_weight * 0.5 + neighbor_weight);
+    //  Scale the texel offset of the bottom-right sample (0d) into uv units:
+    vec2 uv_step = dxdy * vec2(offset_ratio, offset_ratio);
+    //  Same magnitude, but with the vertical direction flipped:
+    vec2 uv_step_flip_y = vec2(uv_step.x, -uv_step.y);
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Fetch the four diagonal taps; the other three mirror 0d by symmetry:
+    vec3 tap_0a = tex2D_linearize(tex, tex_uv - uv_step).rgb;
+    vec3 tap_0b = tex2D_linearize(tex, tex_uv + uv_step_flip_y).rgb;
+    vec3 tap_0c = tex2D_linearize(tex, tex_uv - uv_step_flip_y).rgb;
+    vec3 tap_0d = tex2D_linearize(tex, tex_uv + uv_step).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Every tap carries the same weight, so a plain average suffices
+    //  (the 0.25 factor keeps the total weight at 1.0):
+    return 0.25 * (tap_0a + tap_0b + tap_0c + tap_0d);
+}
+
+//  Default-sigma overload: forwards to the sigma version with blur9_std_dev.
+vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+//  blur17_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+
+//  blur25_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+
+//  blur43_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+//  blur31_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+
+//  blur3_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+//  blur3_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+//  blur5_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+
+//  blur5_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+//  blur3_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+//  blur5_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+
+//  blur7_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+//  blur7_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+//  blur7_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+//  blur9_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+//  blur9_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+//  blur11_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+
+//  blur11_std_dev overload; "tex" avoids hiding the built-in texture().
+vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+
+#endif  //  BLUR_FUNCTIONS_H
+
+#define InputSize sourceSize[0].xy
+#define TextureSize sourceSize[0].xy
+#define OutputSize targetSize.xy
+
+void main() {
+    //  Forward the vertex position and texture coordinate unchanged:
+    gl_Position = position;
+    vTexCoord = texCoord;
+    //  Find the uv sample distance between output pixels.  Blurs are not
+    //  generic Gaussian resizers, and correct blurs require:
+    //  1.) OutputSize == InputSize * 2^m, where m is an integer <= 0.
+    //  2.) mipmap_inputN = "true" for this pass in the preset if m != 0
+    //  3.) filter_linearN = "true" except for 1x scale nearest neighbor blurs
+    //  A Gaussian resizer would upsize using the distance between input
+    //  texels (not output pixels); this pass always blurs at the destination
+    //  size instead, because mixing statically calculated weights with
+    //  bilinear sample exploitation would otherwise cause terrible artifacts.
+    vec2 uv_per_output_pixel = (InputSize/OutputSize)/TextureSize;
+    //  This blur is horizontal-only, so zero out the vertical offset:
+    blur_dxdy = vec2(uv_per_output_pixel.x, 0.0);
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/blur9fast-vertical.fs b/shaders/CRT-Royale.shader/blur9fast-vertical.fs
new file mode 100644
index 00000000..c7293eed
--- /dev/null
+++ b/shaders/CRT-Royale.shader/blur9fast-vertical.fs
@@ -0,0 +1,2016 @@
+#version 150
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+#if __VERSION__ >= 130
+#define COMPAT_TEXTURE texture
+#else
+#define COMPAT_TEXTURE texture2D
+#endif
+
+#ifdef GL_ES
+#define COMPAT_PRECISION mediump
+#else
+#define COMPAT_PRECISION
+#endif
+
+uniform sampler2D source[];
+uniform vec4 sourceSize[];
+uniform vec4 targetSize;
+
+in Vertex {
+   vec2 vTexCoord;
+   vec2 blur_dxdy;
+};
+
+out vec4 FragColor;
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  PASS SETTINGS:
+//  gamma-management.h needs to know what kind of pipeline we're using and
+//  what pass this is in that pipeline.  This will become obsolete if/when we
+//  can #define things like this in the .cgp preset file.
+//#define GAMMA_ENCODE_EVERY_FBO
+//#define FIRST_PASS
+//#define LAST_PASS
+//#define SIMULATE_CRT_ON_LCD
+//#define SIMULATE_GBA_ON_LCD
+//#define SIMULATE_LCD_ON_CRT
+//#define SIMULATE_GBA_ON_CRT
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    float lcd_reference_gamma = 2.5;       //  To match CRT
+    float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define crt_gamma, gba_gamma, and lcd_gamma:
+    float get_crt_gamma()    {   return crt_gamma;   }
+    float get_gba_gamma()    {   return gba_gamma;   }
+    float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Defaults: high CRT reference gamma, fixed GBA gamma, office LCD gamma
+    float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define intermediate_gamma, input_gamma,
+    float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    float get_input_gamma()          {   return input_gamma;         }
+    float get_output_gamma()         {   return output_gamma;        }   //  ...and output_gamma.
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD  //  CRT input gamma, LCD output gamma
+        float get_input_gamma()      {   return get_crt_gamma();     }
+        float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD  //  GBA input gamma, LCD output gamma
+        float get_input_gamma()      {   return get_gba_gamma();     }
+        float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT  //  LCD input gamma, CRT output gamma
+        float get_input_gamma()      {   return get_lcd_gamma();     }
+        float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT  //  GBA input gamma, CRT output gamma
+        float get_input_gamma()      {   return get_gba_gamma();     }
+        float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything (ntsc_gamma in both directions):
+        float get_input_gamma()      {   return ntsc_gamma;          }
+        float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+#ifndef GAMMA_ENCODE_EVERY_FBO  //  Only FIRST_PASS decodes and only LAST_PASS encodes:
+    #ifdef FIRST_PASS
+        bool linearize_input = true;
+        float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Not the first pass: linearize_input is off and the pass gamma is 1.0
+        bool linearize_input = false;
+        float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        bool gamma_encode_output = true;
+        float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Not the last pass: gamma_encode_output is off and the pass gamma is 1.0
+        bool gamma_encode_output = false;
+        float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif  //  LAST_PASS
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output
+    bool linearize_input = true;
+    bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Middle/last passes read the intermediate encoding:
+        float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  First/middle passes write the intermediate encoding:
+        float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+vec4 decode_input(vec4 color)
+{
+    //  Requires:   color is a sample in this pass's input encoding.
+    //  Returns:    color unchanged when linearize_input is false.  Otherwise
+    //              rgb is raised to get_pass_input_gamma(), and alpha is
+    //              forced to 1.0 only when assume_opaque_alpha is true.
+    //  Note:       The flags must be tested, not assigned: "if(flag = true)"
+    //              overwrites the global setting and always takes the branch.
+    if(!linearize_input)
+    {
+        return color;
+    }
+
+    //  Decode the color channels only; alpha is not gamma-corrected:
+    vec3 decoded_rgb = pow(color.rgb, vec3(get_pass_input_gamma()));
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(decoded_rgb, alpha);
+}
+
+vec4 encode_output(vec4 color)
+{
+    //  Requires:   color is this pass's result before output encoding.
+    //  Returns:    color unchanged when gamma_encode_output is false.
+    //              Otherwise rgb is raised to 1.0/get_pass_output_gamma(), and
+    //              alpha is forced to 1.0 only when assume_opaque_alpha is true.
+    //  Note:       The flags must be tested, not assigned: "if(flag = true)"
+    //              overwrites the global setting and always takes the branch.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+
+    //  Encode the color channels only; alpha is not gamma-corrected:
+    vec3 encoded_rgb = pow(color.rgb, vec3(1.0/get_pass_output_gamma()));
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(encoded_rgb, alpha);
+}
+
+#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords)
+//{   return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords)));   }
+
+//#define tex2D_linearize(C, D, E) decode_input(vec4(COMPAT_TEXTURE(C, D, E)))
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords, int texel_off)
+//{   return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords, texel_off)));    }
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  IN.output_size < IN.video_size.
+//              4.) IN.output_size == IN.video_size / pow(2, M), where M is some
+//                  integer >= 0.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (IN.video_size/IN.output_size)/IN.texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(IN.video_size/IN.output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static float blur3_std_dev
+//                      static float blur4_std_dev
+//                      static float blur5_std_dev
+//                      static float blur6_std_dev
+//                      static float blur7_std_dev
+//                      static float blur8_std_dev
+//                      static float blur9_std_dev
+//                      static float blur10_std_dev
+//                      static float blur11_std_dev
+//                      static float blur12_std_dev
+//                      static float blur17_std_dev
+//                      static float blur25_std_dev
+//                      static float blur31_std_dev
+//                      static float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+#ifndef OVERRIDE_BLUR_STD_DEVS
+    //  blurN_std_dev values are specified in terms of dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  By request, we can define standard deviations corresponding to a
+        //  binomial distribution with p = 0.5 (related to Pascal's triangle).
+        //  This distribution works such that blurring multiple times should
+        //  have the same result as a single larger blur.  These values are
+        //  larger than default for blurs up to 6x and smaller thereafter.
+        float blur3_std_dev = 0.84931640625;
+        float blur4_std_dev = 0.84931640625;
+        float blur5_std_dev = 1.0595703125;
+        float blur6_std_dev = 1.06591796875;
+        float blur7_std_dev = 1.17041015625;
+        float blur8_std_dev = 1.1720703125;
+        float blur9_std_dev = 1.2259765625;
+        float blur10_std_dev = 1.21982421875;
+        float blur11_std_dev = 1.25361328125;
+        float blur12_std_dev = 1.2423828125;
+        float blur17_std_dev = 1.27783203125;
+        float blur25_std_dev = 1.2810546875;
+        float blur31_std_dev = 1.28125;
+        float blur43_std_dev = 1.28125;
+    #else
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        float blur3_std_dev = 0.62666015625;
+        float blur4_std_dev = 0.66171875;
+        float blur5_std_dev = 0.9845703125;
+        float blur6_std_dev = 1.02626953125;
+        float blur7_std_dev = 1.36103515625;
+        float blur8_std_dev = 1.4080078125;
+        float blur9_std_dev = 1.7533203125;
+        float blur10_std_dev = 1.80478515625;
+        float blur11_std_dev = 2.15986328125;
+        float blur12_std_dev = 2.215234375;
+        float blur17_std_dev = 3.45535583496;
+        float blur25_std_dev = 5.3409576416;
+        float blur31_std_dev = 6.86488037109;
+        float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
+    //  in shared-sample blurs but increase blurring and feature shifting.
+    float error_blurring = 0.5;
+#endif
+
+//  Make a length squared helper macro (for usage with static constants):
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+//#include "quad-pixel-communication.h"
+//#include "special-functions.h"
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (vec4/vec3/vec2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+vec4 erf6(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of
+    //                  erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+    //              with a max absolute error of 2.5*10**-5.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    //  The fit is evaluated on |x|; erf is odd, so the sign is restored last.
+    vec4 unit = vec4(1.0);
+    vec4 t = unit/(unit + 0.47047*abs(x));
+    //  Cubic in t (Horner form) times the Gaussian falloff:
+    vec4 tail = t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*exp(-(x*x));
+    vec4 magnitude = unit - tail;
+    return magnitude * sign(x);
+}
+
+vec3 erf6(vec3 x)
+{
+    //  vec3 version: Abramowitz/Stegun fit on |x|, sign restored at the end.
+    vec3 unit = vec3(1.0);
+    vec3 t = unit/(unit + 0.47047*abs(x));
+    //  Cubic in t (Horner form) times the Gaussian falloff:
+    vec3 tail = t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*exp(-(x*x));
+    vec3 magnitude = unit - tail;
+    return magnitude * sign(x);
+}
+
+vec2 erf6(vec2 x)
+{
+    //  vec2 overload of the Abramowitz/Stegun erf() fit (see vec4 version);
+    //  evaluated on |x|, then the sign of x is reapplied:
+    vec2 unit = vec2(1.0);
+    vec2 t = unit/(unit + 0.47047*abs(x));
+    vec2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec2 magnitude = unit - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float erf6(float x)
+{
+    //  Float overload of the Abramowitz/Stegun erf() fit, evaluated on |x|
+    //  with the sign of x reapplied at the end:
+    float t = 1.0/(1.0 + 0.47047*abs(x));
+    float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float magnitude = 1.0 - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+vec4 erft(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf(x) approximated by a scaled hyperbolic tangent.  The
+    //              error is visually noticeable, but it's very fast and
+    //              perceptually close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly implement
+    //              tanh(): The author's nVidia 8800GTS returned garbage output.
+    return tanh(x * 1.202760580);
+}
+
+vec3 erft(vec3 x)
+{
+    //  vec3 overload: tanh()-based erf() approximation (needs a working tanh()).
+    return tanh(x * 1.202760580);
+}
+
+vec2 erft(vec2 x)
+{
+    //  vec2 overload: tanh()-based erf() approximation (needs a working tanh()).
+    return tanh(x * 1.202760580);
+}
+
+float erft(float x)
+{
+    //  Float overload: tanh()-based erf() approximation (needs a working tanh()).
+    return tanh(x * 1.202760580);
+}
+
+vec4 erf(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec3 erf(vec3 x)
+{
+    //  vec3 overload: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec2 erf(vec2 x)
+{
+    //  vec2 overload: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+float erf(float x)
+{
+    //  Float overload: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+vec4 gamma_impl(vec4 s, vec4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller precomputes this value so
+    //                  that it can be reused elsewhere.
+    //  Returns:    Approximate gamma function (real-numbered factorial)
+    //              output using the Lanczos approximation with two
+    //              coefficients, calculated using Paul Godfrey's method:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  Three coeffs (0.0000346 error) would not
+    //              hurt latency, but two allow more parallelism with outside
+    //              instructions.
+    //  gamma(s + 1) = ((s + 0.5 + g)/e)**(s + 0.5) * lanczos_series, and
+    //  gamma(s) = gamma(s + 1)/s.  Dividing at the end has less error for
+    //  small s's than (s -= 1.0) at the beginning.
+    vec4 lanczos_g = vec4(1.12906830989);
+    vec4 coeff0 = vec4(0.8109119309638332633713423362694399653724431);
+    vec4 coeff1 = vec4(0.4808354605142681877121661197951496120000040);
+    vec4 euler = vec4(2.71828182845904523536028747135266249775724709);
+    vec4 exponent = s + vec4(0.5);
+    vec4 lanczos_series = coeff0 + coeff1/(s + vec4(1.0));
+    vec4 power_base = (exponent + lanczos_g)/euler;
+    return (pow(power_base, exponent) * lanczos_series) * s_inv;
+}
+
+vec3 gamma_impl(vec3 s, vec3 s_inv)
+{
+    //  vec3 overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+    vec3 lanczos_g = vec3(1.12906830989);
+    vec3 coeff0 = vec3(0.8109119309638332633713423362694399653724431);
+    vec3 coeff1 = vec3(0.4808354605142681877121661197951496120000040);
+    vec3 euler = vec3(2.71828182845904523536028747135266249775724709);
+    vec3 exponent = s + vec3(0.5);
+    vec3 lanczos_series = coeff0 + coeff1/(s + vec3(1.0));
+    vec3 power_base = (exponent + lanczos_g)/euler;
+    return (pow(power_base, exponent) * lanczos_series) * s_inv;
+}
+
+vec2 gamma_impl(vec2 s, vec2 s_inv)
+{
+    //  vec2 overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+    vec2 lanczos_g = vec2(1.12906830989);
+    vec2 coeff0 = vec2(0.8109119309638332633713423362694399653724431);
+    vec2 coeff1 = vec2(0.4808354605142681877121661197951496120000040);
+    vec2 euler = vec2(2.71828182845904523536028747135266249775724709);
+    vec2 exponent = s + vec2(0.5);
+    vec2 lanczos_series = coeff0 + coeff1/(s + vec2(1.0));
+    vec2 power_base = (exponent + lanczos_g)/euler;
+    return (pow(power_base, exponent) * lanczos_series) * s_inv;
+}
+
+float gamma_impl(float s, float s_inv)
+{
+    //  Float overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+    float lanczos_g = 1.12906830989;
+    float coeff0 = 0.8109119309638332633713423362694399653724431;
+    float coeff1 = 0.4808354605142681877121661197951496120000040;
+    float euler = 2.71828182845904523536028747135266249775724709;
+    float exponent = s + 0.5;
+    float lanczos_series = coeff0 + coeff1/(s + 1.0);
+    float power_base = (exponent + lanczos_g)/euler;
+    return (pow(power_base, exponent) * lanczos_series) * s_inv;
+}
+
+vec4 gamma(vec4 s)
+{
+    //  Requires:   s is the gamma function parameter, ideally in [0, 36].
+    //  Returns:    Approximate gamma(s) with a maximum relative error of
+    //              0.000463.  See gamma_impl() for details.
+    vec4 s_inv = vec4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+vec3 gamma(vec3 s)
+{
+    vec3 s_inv = vec3(1.0)/s;       //  gamma_impl() also takes 1.0/s.
+    return gamma_impl(s, s_inv);    //  vec3 overload of approximate gamma(s).
+}
+
+vec2 gamma(vec2 s)
+{
+    vec2 s_inv = vec2(1.0)/s;       //  gamma_impl() also takes 1.0/s.
+    return gamma_impl(s, s_inv);    //  vec2 overload of approximate gamma(s).
+}
+
+float gamma(float s)
+{
+    float s_inv = 1.0/s;            //  gamma_impl() also takes 1.0/s.
+    return gamma_impl(s, s_inv);    //  Float overload of approximate gamma(s).
+}
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+vec4 ligamma_small_z_impl(vec4 s, vec4 z, vec4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms):
+    //                  z**s * sum((-z)**k / (k! * (s + k))) for k in [0, 3]
+    //  The "rolled up" summation looks like:
+    //      term = 1.0; series = 1.0/s;
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          term *= -z/k;
+    //          series += term/(s + k);
+    //      }
+    //  Unrolled and constant-folded, the denominators k! * (s + k) become:
+    vec4 k1_denom = s + vec4(1.0);          //  1! * (s + 1)
+    vec4 k2_denom = 2.0*s + vec4(4.0);      //  2! * (s + 2)
+    vec4 k3_denom = 6.0*s + vec4(18.0);     //  3! * (s + 3)
+    //  A fifth term would be z**4/(24.0*s + vec4(96.0)); it is left out.
+    vec4 z_squared = z*z;
+    vec4 z_cubed = z * z_squared;
+    //  Accumulate with alternating signs, starting from the k = 0 term (1/s):
+    vec4 series = s_inv;
+    series -= z/k1_denom;
+    series += z_squared/k2_denom;
+    series -= z_cubed/k3_denom;
+    //  Scale by z**s and return:
+    vec4 z_pow_s = pow(z, s);
+    return z_pow_s * series;
+}
+
+vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv)
+{
+    //  vec3 overload: 4-term series z**s * sum((-z)**k / (k! * (s + k))).
+    //  Requires s < ~0.5, z <= ~0.775075, and s_inv = 1.0/s.
+    vec3 k1_denom = s + vec3(1.0);
+    vec3 k2_denom = 2.0*s + vec3(4.0);
+    vec3 k3_denom = 6.0*s + vec3(18.0);
+    vec3 z_squared = z*z;
+    vec3 series = s_inv;                    //  k = 0 term
+    series -= z/k1_denom;
+    series += z_squared/k2_denom;
+    series -= (z * z_squared)/k3_denom;
+    return pow(z, s) * series;
+}
+
+vec2 ligamma_small_z_impl(vec2 s, vec2 z, vec2 s_inv)
+{
+    //  vec2 overload: 4-term series z**s * sum((-z)**k / (k! * (s + k))).
+    //  Requires s < ~0.5, z <= ~0.775075, and s_inv = 1.0/s.
+    vec2 k1_denom = s + vec2(1.0);
+    vec2 k2_denom = 2.0*s + vec2(4.0);
+    vec2 k3_denom = 6.0*s + vec2(18.0);
+    vec2 z_squared = z*z;
+    vec2 series = s_inv;                    //  k = 0 term
+    series -= z/k1_denom;
+    series += z_squared/k2_denom;
+    series -= (z * z_squared)/k3_denom;
+    return pow(z, s) * series;
+}
+
+float ligamma_small_z_impl(float s, float z, float s_inv)
+{
+    //  Float overload: 4-term series z**s * sum((-z)**k / (k! * (s + k))).
+    //  Requires s < ~0.5, z <= ~0.775075, and s_inv = 1.0/s.
+    float k1_denom = s + 1.0;
+    float k2_denom = 2.0*s + 4.0;
+    float k3_denom = 6.0*s + 18.0;
+    float z_squared = z*z;
+    float series = s_inv;                   //  k = 0 term
+    series -= z/k1_denom;
+    series += z_squared/k2_denom;
+    series -= (z * z_squared)/k3_denom;
+    return pow(z, s) * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+vec4 uigamma_large_z_impl(vec4 s, vec4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+    //  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up:"
+    //      denom = vec4('inf');
+    //      vec4 one = vec4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+    //      }
+    //  Unrolled and constant-folded, one variable per level (i = 4 first):
+    vec4 level4 = vec4(7.0) + z - s;
+    vec4 level3 = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/level4;
+    vec4 level2 = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/level3;
+    vec4 level1 = vec4(1.0) + z - s + (s - vec4(1.0))/level2;
+    vec4 prefactor = pow(z, s) * exp(-z);
+    return prefactor / level1;
+}
+
+vec3 uigamma_large_z_impl(vec3 s, vec3 z)
+{
+    //  vec3 overload: 4-level continued fraction, innermost level first.
+    vec3 level4 = vec3(7.0) + z - s;
+    vec3 level3 = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/level4;
+    vec3 level2 = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/level3;
+    vec3 level1 = vec3(1.0) + z - s + (s - vec3(1.0))/level2;
+    vec3 prefactor = pow(z, s) * exp(-z);
+    return prefactor / level1;
+}
+
+vec2 uigamma_large_z_impl(vec2 s, vec2 z)
+{
+    //  vec2 overload: 4-level continued fraction, innermost level first.
+    vec2 level4 = vec2(7.0) + z - s;
+    vec2 level3 = vec2(5.0) + z - s + (3.0*s - vec2(9.0))/level4;
+    vec2 level2 = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/level3;
+    vec2 level1 = vec2(1.0) + z - s + (s - vec2(1.0))/level2;
+    vec2 prefactor = pow(z, s) * exp(-z);
+    return prefactor / level1;
+}
+
+float uigamma_large_z_impl(float s, float z)
+{
+    //  Float overload: 4-level continued fraction, innermost level first.
+    float level4 = 7.0 + z - s;
+    float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
+    float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
+    float level1 = 1.0 + z - s + (s - 1.0)/level2;
+    float prefactor = pow(z, s) * exp(-z);
+    return prefactor / level1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+vec4 normalized_ligamma_impl(vec4 s, vec4 z,
+    vec4 s_inv, vec4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  Since we only care about s < 0.5, we only need
+    //              to evaluate two branches (not four) based on z.  Each branch
+    //              uses four terms, with a max relative error of ~0.00182.  The
+    //              branch threshold and specifics were adapted for fewer terms
+    //              from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Evaluate both branches: Real branches test slower even when available.
+    vec4 thresh = vec4(0.775075);
+    bvec4 z_is_large = greaterThan(z , thresh);
+    vec4 z_size_check = vec4(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0, z_is_large.w ? 1.0 : 0.0);
+    vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  Select per component: large_z where z > thresh, small_z elsewhere.
+    return large_z * z_size_check + small_z * (vec4(1.0) - z_size_check);
+}
+
+vec3 normalized_ligamma_impl(vec3 s, vec3 z,
+    vec3 s_inv, vec3 gamma_s_inv)
+{
+    //  vec3 version; picks large_z where z > thresh and small_z elsewhere:
+    vec3 thresh = vec3(0.775075);
+    bvec3 z_is_large = greaterThan(z , thresh);
+    vec3 z_size_check = vec3(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0);
+    vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return large_z * z_size_check + small_z * (vec3(1.0) - z_size_check);
+}
+
+vec2 normalized_ligamma_impl(vec2 s, vec2 z,
+    vec2 s_inv, vec2 gamma_s_inv)
+{
+    //  vec2 version; picks large_z where z > thresh and small_z elsewhere:
+    vec2 thresh = vec2(0.775075);
+    bvec2 z_is_large = greaterThan(z , thresh);
+    vec2 z_size_check = vec2(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0);
+    vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return large_z * z_size_check + small_z * (vec2(1.0) - z_size_check);
+}
+
+float normalized_ligamma_impl(float s, float z,
+    float s_inv, float gamma_s_inv)
+{
+    //  Float version; picks large_z when z > thresh and small_z otherwise:
+    float thresh = 0.775075;
+    float z_size_check = 0.0;
+    if (z > thresh) z_size_check = 1.0;
+    float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    return large_z * z_size_check + small_z * (1.0 - z_size_check);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+vec4 normalized_ligamma(vec4 s, vec4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    The approximate normalized lower incomplete gamma function
+    //              for s < 0.5.  See normalized_ligamma_impl() for details.
+    vec4 inv_s = vec4(1.0)/s;
+    vec4 inv_gamma_s = vec4(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+vec3 normalized_ligamma(vec3 s, vec3 z)
+{
+    //  vec3 overload: precompute 1/s and 1/gamma(s), then defer to the _impl.
+    vec3 inv_s = vec3(1.0)/s;
+    vec3 inv_gamma_s = vec3(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+vec2 normalized_ligamma(vec2 s, vec2 z)
+{
+    //  vec2 overload: precompute 1/s and 1/gamma(s), then defer to the _impl.
+    vec2 inv_s = vec2(1.0)/s;
+    vec2 inv_gamma_s = vec2(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+float normalized_ligamma(float s, float z)
+{
+    //  Float overload: precompute 1/s and 1/gamma(s), then defer to the _impl.
+    float inv_s = 1.0/s;
+    float inv_gamma_s = 1.0/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+vec4 uv2_to_uv4(vec2 tex_uv)
+{
+    //  Pad a vec2 uv offset with zeros so it can be added to vec4 tex2Dlod coords:
+    return vec4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Squared length as a macro (for static constants); expands its argument twice:
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+float get_fast_gaussian_weight_sum_inv(float sigma)
+{
+    //  Returns:    An approximate 1.0/(sum of unnormalized Gaussian weights)
+    //              for a blur with standard deviation sigma.
+    //  Since the unnormalized center pixel weight is 1.0, the normalized
+    //  center weight equals the weight sum inverse.  Given a large enough
+    //  blur (9+), the Gaussian integral gives an asymptotic center weight:
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //                    = erf((0.5/sqrt(2.0))/sigma), as erf(-x) == -erf(x)
+    //  Curve-fitting is even faster, and closer than the asymptotic result,
+    //  because the fit was constructed from 64 blur sizes from [3, 131) and
+    //  255 equally-spaced sigmas from (0, blurN_std_dev), so the results for
+    //  smaller sigmas are biased toward smaller blurs.  Max error: 0.0031793913.
+    //  Relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    float double_exp_fit = exp(exp(0.348348412457428/
+        (sigma - 0.0860587260734721)));
+    float reciprocal_fit = 0.399334576340352/sigma;
+    return min(double_exp_fit, reciprocal_fit);
+}
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup using an 11-tap
+    //              blur.  It may be mipmapped depending on settings and dxdy.
+    //  Calculate Gaussian blur kernel weights and a normalization factor for
+    //  distances of 0-5, ignoring constant factors (since we're normalizing).
+    //  The center tap (distance 0) has an unnormalized weight of exactly 1.0.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w1 = exp(-1.0 * inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float norm = 1.0 /
+        (1.0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Sum the weighted samples from one end of the kernel to the other, then
+    //  normalize.  The weights are computed from sigma on every call.
+    vec3 blurred = vec3(0.0);
+    blurred += w5 * tex2D_linearize(tex, tex_uv - dxdy * 5.0).rgb;
+    blurred += w4 * tex2D_linearize(tex, tex_uv - dxdy * 4.0).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv - dxdy * 3.0).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv - dxdy * 2.0).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + dxdy * 2.0).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv + dxdy * 3.0).rgb;
+    blurred += w4 * tex2D_linearize(tex, tex_uv + dxdy * 4.0).rgb;
+    blurred += w5 * tex2D_linearize(tex, tex_uv + dxdy * 5.0).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup using a 9-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Get the texel weights for distances 1-4 and a normalization factor.
+    //  The center tap (distance 0) has an unnormalized weight of exactly 1.0.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w1 = exp(-1.0 * inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float norm = 1.0 / (1.0 + 2.0 * (w1 + w2 + w3 + w4));
+    //  Sum the weighted samples from one end to the other, then normalize:
+    vec3 blurred = vec3(0.0);
+    blurred += w4 * tex2D_linearize(tex, tex_uv - dxdy * 4.0).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv - dxdy * 3.0).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv - dxdy * 2.0).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + dxdy * 2.0).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv + dxdy * 3.0).rgb;
+    blurred += w4 * tex2D_linearize(tex, tex_uv + dxdy * 4.0).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup using a 7-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Get the texel weights for distances 1-3 and a normalization factor.
+    //  The center tap (distance 0) has an unnormalized weight of exactly 1.0.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w1 = exp(-1.0 * inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float norm = 1.0 / (1.0 + 2.0 * (w1 + w2 + w3));
+    //  Sum the weighted samples from one end to the other, then normalize:
+    vec3 blurred = vec3(0.0);
+    blurred += w3 * tex2D_linearize(tex, tex_uv - dxdy * 3.0).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv - dxdy * 2.0).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + dxdy * 2.0).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv + dxdy * 3.0).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup using a 5-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Get the texel weights for distances 1-2 and a normalization factor.
+    //  The center tap (distance 0) has an unnormalized weight of exactly 1.0.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w1 = exp(-1.0 * inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float norm = 1.0 / (1.0 + 2.0 * (w1 + w2));
+    //  Sum the weighted samples from one end to the other, then normalize:
+    vec3 blurred = vec3(0.0);
+    blurred += w2 * tex2D_linearize(tex, tex_uv - dxdy * 2.0).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + dxdy * 2.0).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using a 3-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Get the texel weight for distance 1 and a normalization factor.
+    //  The center tap (distance 0) has an unnormalized weight of exactly 1.0.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w1 = exp(-1.0 * inv_two_var);
+    float norm = 1.0 / (1.0 + 2.0 * w1);
+    //  Sum the weighted samples from one end to the other, then normalize:
+    vec3 blurred = vec3(0.0);
+    blurred += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    blurred += tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    return blurred * norm;
+}
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup using 6 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Get the texel weights for distances 1-5 (the center weight is 1.0):
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w1 = exp(-1.0 * inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float norm = 1.0 /
+        (1.0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Pair up adjacent texels so that one bilinear tap fetches both.  The
+    //  center texel is shared by the two innermost taps, so halve its weight.
+    float pair_0_1 = 0.5 + w1;
+    float pair_2_3 = w2 + w3;
+    float pair_4_5 = w4 + w5;
+    //  Place each tap at the weight-proportional spot between its texels:
+    vec2 tap_0_1 = (w1/pair_0_1) * dxdy;
+    vec2 tap_2_3 = (2.0 + w3/pair_2_3) * dxdy;
+    vec2 tap_4_5 = (4.0 + w5/pair_4_5) * dxdy;
+    //  Sum the weighted taps from one end to the other, then normalize:
+    vec3 blurred = vec3(0.0);
+    blurred += pair_4_5 * tex2D_linearize(tex, tex_uv - tap_4_5).rgb;
+    blurred += pair_2_3 * tex2D_linearize(tex, tex_uv - tap_2_3).rgb;
+    blurred += pair_0_1 * tex2D_linearize(tex, tex_uv - tap_0_1).rgb;
+    blurred += pair_0_1 * tex2D_linearize(tex, tex_uv + tap_0_1).rgb;
+    blurred += pair_2_3 * tex2D_linearize(tex, tex_uv + tap_2_3).rgb;
+    blurred += pair_4_5 * tex2D_linearize(tex, tex_uv + tap_4_5).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 17x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 8 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Get the texel weights for distances 1-8 (the center weight is 1.0):
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w1 = exp(-1.0 * inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float w6 = exp(-36.0 * inv_two_var);
+    float w7 = exp(-49.0 * inv_two_var);
+    float w8 = exp(-64.0 * inv_two_var);
+    //  The exact normalization is 1.0/(1.0 + 2.0 * (w1 + w2 + ... + w8)),
+    //  but the curve-fitted approximation is used instead:
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Pair up adjacent off-center texels so one bilinear tap fetches both:
+    float pair_1_2 = w1 + w2;
+    float pair_3_4 = w3 + w4;
+    float pair_5_6 = w5 + w6;
+    float pair_7_8 = w7 + w8;
+    //  Place each tap at the weight-proportional spot between its texels:
+    vec2 tap_1_2 = (1.0 + w2/pair_1_2) * dxdy;
+    vec2 tap_3_4 = (3.0 + w4/pair_3_4) * dxdy;
+    vec2 tap_5_6 = (5.0 + w6/pair_5_6) * dxdy;
+    vec2 tap_7_8 = (7.0 + w8/pair_7_8) * dxdy;
+    //  Sum the weighted taps from one end to the other, then normalize:
+    vec3 blurred = vec3(0.0);
+    blurred += pair_7_8 * tex2D_linearize(tex, tex_uv - tap_7_8).rgb;
+    blurred += pair_5_6 * tex2D_linearize(tex, tex_uv - tap_5_6).rgb;
+    blurred += pair_3_4 * tex2D_linearize(tex, tex_uv - tap_3_4).rgb;
+    blurred += pair_1_2 * tex2D_linearize(tex, tex_uv - tap_1_2).rgb;
+    blurred += tex2D_linearize(tex, tex_uv).rgb;
+    blurred += pair_1_2 * tex2D_linearize(tex, tex_uv + tap_1_2).rgb;
+    blurred += pair_3_4 * tex2D_linearize(tex, tex_uv + tap_3_4).rgb;
+    blurred += pair_5_6 * tex2D_linearize(tex, tex_uv + tap_5_6).rgb;
+    blurred += pair_7_8 * tex2D_linearize(tex, tex_uv + tap_7_8).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 25x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 12 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Get the texel weights for distances 1-12 (the center weight is 1.0):
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w1 = exp(-1.0 * inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float w6 = exp(-36.0 * inv_two_var);
+    float w7 = exp(-49.0 * inv_two_var);
+    float w8 = exp(-64.0 * inv_two_var);
+    float w9 = exp(-81.0 * inv_two_var);
+    float w10 = exp(-100.0 * inv_two_var);
+    float w11 = exp(-121.0 * inv_two_var);
+    float w12 = exp(-144.0 * inv_two_var);
+    //  The exact normalization is 1.0/(1.0 + 2.0 * (w1 + w2 + ... + w12)),
+    //  but the curve-fitted approximation is used instead:
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Pair up adjacent off-center texels so one bilinear tap fetches both:
+    float pair_1_2 = w1 + w2;
+    float pair_3_4 = w3 + w4;
+    float pair_5_6 = w5 + w6;
+    float pair_7_8 = w7 + w8;
+    float pair_9_10 = w9 + w10;
+    float pair_11_12 = w11 + w12;
+    //  Place each tap at the weight-proportional spot between its texels:
+    vec2 tap_1_2 = (1.0 + w2/pair_1_2) * dxdy;
+    vec2 tap_3_4 = (3.0 + w4/pair_3_4) * dxdy;
+    vec2 tap_5_6 = (5.0 + w6/pair_5_6) * dxdy;
+    vec2 tap_7_8 = (7.0 + w8/pair_7_8) * dxdy;
+    vec2 tap_9_10 = (9.0 + w10/pair_9_10) * dxdy;
+    vec2 tap_11_12 = (11.0 + w12/pair_11_12) * dxdy;
+    //  Sum the weighted taps from one end to the other, then normalize:
+    vec3 blurred = vec3(0.0);
+    blurred += pair_11_12 * tex2D_linearize(tex, tex_uv - tap_11_12).rgb;
+    blurred += pair_9_10 * tex2D_linearize(tex, tex_uv - tap_9_10).rgb;
+    blurred += pair_7_8 * tex2D_linearize(tex, tex_uv - tap_7_8).rgb;
+    blurred += pair_5_6 * tex2D_linearize(tex, tex_uv - tap_5_6).rgb;
+    blurred += pair_3_4 * tex2D_linearize(tex, tex_uv - tap_3_4).rgb;
+    blurred += pair_1_2 * tex2D_linearize(tex, tex_uv - tap_1_2).rgb;
+    blurred += tex2D_linearize(tex, tex_uv).rgb;
+    blurred += pair_1_2 * tex2D_linearize(tex, tex_uv + tap_1_2).rgb;
+    blurred += pair_3_4 * tex2D_linearize(tex, tex_uv + tap_3_4).rgb;
+    blurred += pair_5_6 * tex2D_linearize(tex, tex_uv + tap_5_6).rgb;
+    blurred += pair_7_8 * tex2D_linearize(tex, tex_uv + tap_7_8).rgb;
+    blurred += pair_9_10 * tex2D_linearize(tex, tex_uv + tap_9_10).rgb;
+    blurred += pair_11_12 * tex2D_linearize(tex, tex_uv + tap_11_12).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 31x Gaussian blurred texture lookup using 16 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Get the texel weights for distances 1-15 (the center weight is 1.0):
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w1 = exp(-1.0 * inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float w6 = exp(-36.0 * inv_two_var);
+    float w7 = exp(-49.0 * inv_two_var);
+    float w8 = exp(-64.0 * inv_two_var);
+    float w9 = exp(-81.0 * inv_two_var);
+    float w10 = exp(-100.0 * inv_two_var);
+    float w11 = exp(-121.0 * inv_two_var);
+    float w12 = exp(-144.0 * inv_two_var);
+    float w13 = exp(-169.0 * inv_two_var);
+    float w14 = exp(-196.0 * inv_two_var);
+    float w15 = exp(-225.0 * inv_two_var);
+    //  The exact normalization would be:
+    //      1.0/(1.0 + 2.0 * (w1 + w2 + ... + w15))
+    //  but the curve-fitted approximation is used instead:
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Pair up adjacent texels so that one bilinear tap fetches both.  The
+    //  center texel is shared by the two innermost taps, so halve its weight.
+    float pair_0_1 = 0.5 + w1;
+    float pair_2_3 = w2 + w3;
+    float pair_4_5 = w4 + w5;
+    float pair_6_7 = w6 + w7;
+    float pair_8_9 = w8 + w9;
+    float pair_10_11 = w10 + w11;
+    float pair_12_13 = w12 + w13;
+    float pair_14_15 = w14 + w15;
+    //  Place each tap at the weight-proportional spot between its texels:
+    vec2 tap_0_1 = (w1/pair_0_1) * dxdy;
+    vec2 tap_2_3 = (2.0 + w3/pair_2_3) * dxdy;
+    vec2 tap_4_5 = (4.0 + w5/pair_4_5) * dxdy;
+    vec2 tap_6_7 = (6.0 + w7/pair_6_7) * dxdy;
+    vec2 tap_8_9 = (8.0 + w9/pair_8_9) * dxdy;
+    vec2 tap_10_11 = (10.0 + w11/pair_10_11) * dxdy;
+    vec2 tap_12_13 = (12.0 + w13/pair_12_13) * dxdy;
+    vec2 tap_14_15 = (14.0 + w15/pair_14_15) * dxdy;
+    //  Sum the weighted taps from one end to the other, then normalize:
+    vec3 blurred = vec3(0.0);
+    blurred += pair_14_15 * tex2D_linearize(tex, tex_uv - tap_14_15).rgb;
+    blurred += pair_12_13 * tex2D_linearize(tex, tex_uv - tap_12_13).rgb;
+    blurred += pair_10_11 * tex2D_linearize(tex, tex_uv - tap_10_11).rgb;
+    blurred += pair_8_9 * tex2D_linearize(tex, tex_uv - tap_8_9).rgb;
+    blurred += pair_6_7 * tex2D_linearize(tex, tex_uv - tap_6_7).rgb;
+    blurred += pair_4_5 * tex2D_linearize(tex, tex_uv - tap_4_5).rgb;
+    blurred += pair_2_3 * tex2D_linearize(tex, tex_uv - tap_2_3).rgb;
+    blurred += pair_0_1 * tex2D_linearize(tex, tex_uv - tap_0_1).rgb;
+    blurred += pair_0_1 * tex2D_linearize(tex, tex_uv + tap_0_1).rgb;
+    blurred += pair_2_3 * tex2D_linearize(tex, tex_uv + tap_2_3).rgb;
+    blurred += pair_4_5 * tex2D_linearize(tex, tex_uv + tap_4_5).rgb;
+    blurred += pair_6_7 * tex2D_linearize(tex, tex_uv + tap_6_7).rgb;
+    blurred += pair_8_9 * tex2D_linearize(tex, tex_uv + tap_8_9).rgb;
+    blurred += pair_10_11 * tex2D_linearize(tex, tex_uv + tap_10_11).rgb;
+    blurred += pair_12_13 * tex2D_linearize(tex, tex_uv + tap_12_13).rgb;
+    blurred += pair_14_15 * tex2D_linearize(tex, tex_uv + tap_14_15).rgb;
+    return blurred * norm;
+}
+
+vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights: wN = exp(-N*N/(2*sigma*sigma)), N = 0..21.
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    float w2 = exp(-4.0 * denom_inv);
+    float w3 = exp(-9.0 * denom_inv);
+    float w4 = exp(-16.0 * denom_inv);
+    float w5 = exp(-25.0 * denom_inv);
+    float w6 = exp(-36.0 * denom_inv);
+    float w7 = exp(-49.0 * denom_inv);
+    float w8 = exp(-64.0 * denom_inv);
+    float w9 = exp(-81.0 * denom_inv);
+    float w10 = exp(-100.0 * denom_inv);
+    float w11 = exp(-121.0 * denom_inv);
+    float w12 = exp(-144.0 * denom_inv);
+    float w13 = exp(-169.0 * denom_inv);
+    float w14 = exp(-196.0 * denom_inv);
+    float w15 = exp(-225.0 * denom_inv);
+    float w16 = exp(-256.0 * denom_inv);
+    float w17 = exp(-289.0 * denom_inv);
+    float w18 = exp(-324.0 * denom_inv);
+    float w19 = exp(-361.0 * denom_inv);
+    float w20 = exp(-400.0 * denom_inv);
+    float w21 = exp(-441.0 * denom_inv);
+    //float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 +
+    //        w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+    float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  Used in place of the exact sum above (TODO confirm accuracy)
+    //  Pair up adjacent texels so that each pair costs a single linear tap.
+    //  The center texel (w0) is read by both innermost taps, so each gets half.
+    float w0_1 = w0 * 0.5 + w1;
+    float w2_3 = w2 + w3;
+    float w4_5 = w4 + w5;
+    float w6_7 = w6 + w7;
+    float w8_9 = w8 + w9;
+    float w10_11 = w10 + w11;
+    float w12_13 = w12 + w13;
+    float w14_15 = w14 + w15;
+    float w16_17 = w16 + w17;
+    float w18_19 = w18 + w19;
+    float w20_21 = w20 + w21;
+    float w0_1_ratio = w1/w0_1;  //  Each ratio: share of the pair's weight owed to its farther texel
+    float w2_3_ratio = w3/w2_3;
+    float w4_5_ratio = w5/w4_5;
+    float w6_7_ratio = w7/w6_7;
+    float w8_9_ratio = w9/w8_9;
+    float w10_11_ratio = w11/w10_11;
+    float w12_13_ratio = w13/w12_13;
+    float w14_15_ratio = w15/w14_15;
+    float w16_17_ratio = w17/w16_17;
+    float w18_19_ratio = w19/w18_19;
+    float w20_21_ratio = w21/w20_21;
+    //  Sum weighted taps from the -dxdy end to the +dxdy end, then normalize:
+    vec3 sum = vec3(0.0);
+    sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Get the unnormalized Gaussian texel weights (center and 1 texel away).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    //  No normalization factor is needed: the two taps below carry equal
+    //  weight, so a plain average of them already has a total weight of 1.0.
+    //  The center texel (with weight w0) is read by both taps, so each tap
+    //  gets half of it.  w01_ratio is the share of a tap owed to texel 1.
+    float w01 = w0 * 0.5 + w1;
+    float w01_ratio = w1/w01;
+    vec2 tap_offset = w01_ratio * dxdy;
+    return 0.5 * (tex2D_linearize(tex, tex_uv - tap_offset).rgb +
+        tex2D_linearize(tex, tex_uv + tap_offset).rgb);
+}
+
+vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv, vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup built from one
+    //              center tap and two linear taps (one per side).  It may be
+    //              mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texels 0, 1, and 2 steps away:
+    float exp_scale = 0.5/(sigma*sigma);
+    float wt0 = 1.0;
+    float wt1 = exp(-1.0 * exp_scale);
+    float wt2 = exp(-4.0 * exp_scale);
+    float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2));
+    //  Merge texels 1 and 2 into one linear tap per side, pair_ratio toward 2:
+    float pair_wt = wt1 + wt2;
+    float pair_ratio = wt2/pair_wt;
+    vec2 pair_offset = (1.0 + pair_ratio) * dxdy;
+    //  Accumulate the -dxdy tap, the center, and the +dxdy tap; then normalize:
+    vec3 total = vec3(0.0);
+    total += pair_wt * tex2D_linearize(tex, tex_uv - pair_offset).rgb;
+    total += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    total += pair_wt * tex2D_linearize(tex, tex_uv + pair_offset).rgb;
+    return total * norm;
+}
+
+vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv, vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup using 4 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texels 0 through 3 steps away:
+    float exp_scale = 0.5/(sigma*sigma);
+    float wt0 = 1.0;
+    float wt1 = exp(-1.0 * exp_scale);
+    float wt2 = exp(-4.0 * exp_scale);
+    float wt3 = exp(-9.0 * exp_scale);
+    float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3));
+    //  Merge texels into pairs (0,1) and (2,3) so each pair is one linear tap.
+    //  Both inner taps read the center texel, so each claims half of wt0.
+    float inner_wt = wt0 * 0.5 + wt1;
+    float outer_wt = wt2 + wt3;
+    //  Tap position = pair's first texel + fraction owed to its second texel:
+    vec2 inner_offset = (wt1/inner_wt) * dxdy;
+    vec2 outer_offset = (2.0 + wt3/outer_wt) * dxdy;
+    //  Accumulate taps from the -dxdy side to the +dxdy side, then normalize:
+    vec3 total = vec3(0.0);
+    total += outer_wt * tex2D_linearize(tex, tex_uv - outer_offset).rgb;
+    total += inner_wt * tex2D_linearize(tex, tex_uv - inner_offset).rgb;
+    total += inner_wt * tex2D_linearize(tex, tex_uv + inner_offset).rgb;
+    total += outer_wt * tex2D_linearize(tex, tex_uv + outer_offset).rgb;
+    return total * norm;
+}
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv, vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input.
+    //  Description:
+    //  Every one of the 3x3 taps is fetched on its own, one dxdy step apart.
+    //  This is the only arbitrarily resizable one-pass blur;
+    //  tex2Dblur5x5resize would perform like tex2Dblur9x9, MUCH slower than
+    //  tex2Dblur5resize.
+    float exp_scale = 0.5/(sigma*sigma);
+    //  Split dxdy into per-axis steps and locate the three sample rows:
+    vec2 step_x = vec2(dxdy.x, 0.0);
+    vec2 step_y = vec2(0.0, dxdy.y);
+    vec2 row1_uv = tex_uv;
+    vec2 row0_uv = row1_uv - step_y;
+    vec2 row2_uv = row1_uv + step_y;
+    //  Fetch the neighborhood row by row (tapRC = row R, column C):
+    vec3 tap00 = tex2D_linearize(tex, row0_uv - step_x).rgb;
+    vec3 tap01 = tex2D_linearize(tex, row0_uv).rgb;
+    vec3 tap02 = tex2D_linearize(tex, row0_uv + step_x).rgb;
+    vec3 tap10 = tex2D_linearize(tex, row1_uv - step_x).rgb;
+    vec3 tap11 = tex2D_linearize(tex, row1_uv).rgb;
+    vec3 tap12 = tex2D_linearize(tex, row1_uv + step_x).rgb;
+    vec3 tap20 = tex2D_linearize(tex, row2_uv - step_x).rgb;
+    vec3 tap21 = tex2D_linearize(tex, row2_uv).rgb;
+    vec3 tap22 = tex2D_linearize(tex, row2_uv + step_x).rgb;
+    //  Gaussian weights for the center, the 4 edge taps, and the 4 corner taps:
+    float center_wt = 1.0;
+    float edge_wt = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * exp_scale);
+    float corner_wt = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * exp_scale);
+    float norm = 1.0/(center_wt + 4.0 * (edge_wt + corner_wt));
+    //  Combine center, edges, and corners; then normalize:
+    vec3 total = center_wt * tap11 +
+        edge_wt * (tap01 + tap10 + tap12 + tap21) +
+        corner_wt * (tap00 + tap02 + tap20 + tap22);
+    return total * norm;
+}
+
+//  Resizable one-pass blurs:
+//  Overload of tex2Dblur3x3resize() that passes blur3_std_dev as sigma.
+vec3 tex2Dblur3x3resize(sampler2D texture, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3x3resize(texture, tex_uv, dxdy, blur3_std_dev);
+}
+
+vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup built from one
+    //              center tap and four linear taps (two per side).  It may
+    //              be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texels 0 through 4 steps away:
+    float exp_scale = 0.5/(sigma*sigma);
+    float wt0 = 1.0;
+    float wt1 = exp(-1.0 * exp_scale);
+    float wt2 = exp(-4.0 * exp_scale);
+    float wt3 = exp(-9.0 * exp_scale);
+    float wt4 = exp(-16.0 * exp_scale);
+    float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4));
+    //  Merge texels into pairs (1,2) and (3,4) so each pair is one linear tap.
+    //  Tap position = pair's first texel + fraction owed to its second texel:
+    float near_wt = wt1 + wt2;
+    float far_wt = wt3 + wt4;
+    vec2 near_offset = (1.0 + wt2/near_wt) * dxdy;
+    vec2 far_offset = (3.0 + wt4/far_wt) * dxdy;
+    //  Accumulate taps from the -dxdy side to the +dxdy side, then normalize:
+    vec3 total = vec3(0.0);
+    total += far_wt * tex2D_linearize(tex, tex_uv - far_offset).rgb;
+    total += near_wt * tex2D_linearize(tex, tex_uv - near_offset).rgb;
+    total += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    total += near_wt * tex2D_linearize(tex, tex_uv + near_offset).rgb;
+    total += far_wt * tex2D_linearize(tex, tex_uv + far_offset).rgb;
+    return total * norm;
+}
+
+vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.  Adjust the
+    //  bilinear sample location to reflect the true Gaussian weights for each
+    //  underlying texel.  The following diagram illustrates the relative
+    //  locations of bilinear samples.  Each sample with the same number has the
+    //  same weight (notice the symmetry).  The letters a, b, c, d distinguish
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w1off = exp(-1.0 * denom_inv);
+    float w2off = exp(-4.0 * denom_inv);
+    float w3off = exp(-9.0 * denom_inv);
+    float w4off = exp(-16.0 * denom_inv);
+    float texel1to2ratio = w2off/(w1off + w2off);  //  Share of the [1, 2] pair owed to texel 2
+    float texel3to4ratio = w4off/(w3off + w4off);  //  Share of the [3, 4] pair owed to texel 4
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0);
+    vec2 sample2R_texel_offset = vec2(3.0, 0.0) + vec2(texel3to4ratio, 0.0);
+    vec2 sample3d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio);
+    vec2 sample4d_texel_offset = vec2(3.0, 1.0) + vec2(texel3to4ratio, texel1to2ratio);
+    vec2 sample5d_texel_offset = vec2(1.0, 3.0) + vec2(texel1to2ratio, texel3to4ratio);
+    vec2 sample6d_texel_offset = vec2(3.0, 3.0) + vec2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    float w1R1 = w1off;  //  Axis-aligned texels reuse the 1D weights from above
+    float w1R2 = w2off;
+    float w2R1 = w3off;
+    float w2R2 = w4off;
+    float w3d1 =     exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    float w3d2_3d3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
+    float w3d4 =     exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
+    float w4d1_5d1 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv);
+    float w4d2_5d3 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * denom_inv);
+    float w4d3_5d2 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv);
+    float w4d4_5d4 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * denom_inv);
+    float w6d1 =     exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv);
+    float w6d2_6d3 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * denom_inv);
+    float w6d4 =     exp(-LENGTH_SQ(vec2(4.0, 4.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    float w0 = 1.0;
+    float w1 = w1R1 + w1R2;
+    float w2 = w2R1 + w2R2;
+    float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    float w5 = w4;  //  Samples 4 and 5 have transposed offsets, hence equal weights
+    float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse (normalization factor):
+    float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));  //  Each non-center weight is used by 4 samples
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);  //  Negates x only
+    vec2 mirror_y = vec2(1.0, -1.0);  //  Negates y only
+    vec2 mirror_xy = vec2(-1.0, -1.0);  //  Negates both axes
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear:
+    vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;  //  .yx moves the offset onto the y axis
+    vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    vec3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    vec3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    vec3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    vec3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    vec3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    vec3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    vec3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    vec3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    vec3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    vec3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    vec3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    vec3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum weighted samples, then normalize so the effective weights total 1.0.
+    vec3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
+    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0off = 1.0;
+    float w1off = exp(-1.0 * denom_inv);
+    float w2off = exp(-4.0 * denom_inv);
+    float w3off = exp(-9.0 * denom_inv);
+    float texel0to1ratio = w1off/(w0off * 0.5 + w1off);  //  Center weight is halved: it is shared by the samples on both sides
+    float texel2to3ratio = w3off/(w2off + w3off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    vec2 sample1d_texel_offset = vec2(texel0to1ratio, texel0to1ratio);
+    vec2 sample2d_texel_offset = vec2(2.0, 0.0) + vec2(texel2to3ratio, texel0to1ratio);
+    vec2 sample3d_texel_offset = vec2(0.0, 2.0) + vec2(texel0to1ratio, texel2to3ratio);
+    vec2 sample4d_texel_offset = vec2(2.0, 2.0) + vec2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    float w1abcd = 1.0;
+    float w1bd2_1cd3 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv);
+    float w2bd1_3cd1 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * denom_inv);
+    float w2bd2_3cd2 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * denom_inv);  //  NOTE(review): the texel diagram labels this texel 3cd3, not 3cd2
+    float w1d4 =       exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    float w2d3_3d2 =   exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
+    float w2d4_3d4 =   exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv);
+    float w4d1 =       exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
+    float w4d2_4d3 =   exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv);
+    float w4d4 =       exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights.
+    //  Split weights for shared texels between samples sharing them:
+    float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;  //  Center texel is split 4 ways
+    float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;  //  Axis texels are split 2 ways
+    float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;
+    //  Get the weight sum inverse (normalization factor):
+    float weight_sum_inv =
+        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));  //  w2_3 counts twice: once for sample 2, once for sample 3
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);  //  Negates x only
+    vec2 mirror_y = vec2(1.0, -1.0);  //  Negates y only
+    vec2 mirror_xy = vec2(-1.0, -1.0);  //  Negates both axes
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    vec3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    vec3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    vec3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    vec3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum weighted samples, then normalize so the effective weights total 1.0.
+    vec3 sum = vec3(0.0);
+    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
+    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
+    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
+    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv, vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  A scaled-down version of tex2Dblur9x9(); see its description for the
+    //  concept.  Sample layout (0C = center; U, D, L, R = axis-aligned;
+    //  a, b, c, d = quadrants):
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels covered by each sample:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  TEXEL WEIGHTS AND SAMPLE OFFSETS:
+    //  1D Gaussian weights for texels 1 and 2 steps from the center, plus the
+    //  weight fraction that places one tap between those two texels:
+    float exp_scale = 0.5/(sigma*sigma);
+    float wt_step1 = exp(-1.0 * exp_scale);
+    float wt_step2 = exp(-4.0 * exp_scale);
+    float frac_1to2 = wt_step2/(wt_step1 + wt_step2);
+    //  Offsets (in dxdy steps) of the 1R sample and the 2d quadrant sample;
+    //  every other sample is a mirrored or swizzled copy of one of these:
+    vec2 axis_offset = vec2(1.0, 0.0) + vec2(frac_1to2, 0.0);
+    vec2 quad_offset = vec2(1.0, 1.0) + vec2(frac_1to2, frac_1to2);
+
+    //  SAMPLE WEIGHTS:
+    //  2D Gaussian weights of the four texels under the 2d sample, named by
+    //  their (x, y) distance from the center (wt_21 covers both 2d2 and 2d3):
+    float wt_11 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * exp_scale);
+    float wt_21 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * exp_scale);
+    float wt_22 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * exp_scale);
+    //  A sample's weight is the sum of the weights of the texels it covers:
+    float center_wt = 1.0;
+    float axis_wt = wt_step1 + wt_step2;
+    float quad_wt = wt_11 + 2.0 * wt_21 + wt_22;
+    //  Normalization factor (four axis samples and four quadrant samples):
+    float norm = 1.0/(center_wt + 4.0 * (axis_wt + quad_wt));
+
+    //  TEXTURE FETCHES:
+    //  Flip the step vector's signs to reach the other three quadrants:
+    vec2 step_flip_x = dxdy * vec2(-1.0, 1.0);
+    vec2 step_flip_y = dxdy * vec2(1.0, -1.0);
+    vec2 step_flip_xy = dxdy * vec2(-1.0, -1.0);
+    //  Center sample:
+    vec3 tap_0C = tex2D_linearize(tex, tex_uv).rgb;
+    //  Axis-aligned samples (.yx moves the offset onto the y axis):
+    vec3 tap_1R = tex2D_linearize(tex, tex_uv + dxdy * axis_offset).rgb;
+    vec3 tap_1D = tex2D_linearize(tex, tex_uv + dxdy * axis_offset.yx).rgb;
+    vec3 tap_1L = tex2D_linearize(tex, tex_uv - dxdy * axis_offset).rgb;
+    vec3 tap_1U = tex2D_linearize(tex, tex_uv - dxdy * axis_offset.yx).rgb;
+    //  Quadrant samples:
+    vec3 tap_2a = tex2D_linearize(tex, tex_uv + step_flip_xy * quad_offset).rgb;
+    vec3 tap_2b = tex2D_linearize(tex, tex_uv + step_flip_y * quad_offset).rgb;
+    vec3 tap_2c = tex2D_linearize(tex, tex_uv + step_flip_x * quad_offset).rgb;
+    vec3 tap_2d = tex2D_linearize(tex, tex_uv + dxdy * quad_offset).rgb;
+
+    //  WEIGHTED SUM:
+    //  Add up the weighted samples group by group, then apply the
+    //  normalization factor so the effective weights total 1.0:
+    vec3 total = center_wt * tap_0C;
+    total += axis_wt * (tap_1R + tap_1D + tap_1L + tap_1U);
+    total += quad_wt * (tap_2a + tap_2b + tap_2c + tap_2d);
+    return total * norm;
+}
+
+vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv, vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed of
+    //              2x2 carefully selected bilinear samples.
+    //  Description:
+    //  See tex2Dblur9x9() and tex2Dblur7() for the underlying ideas, which
+    //  this blur combines.  Sample layout:
+    //      0a 0b
+    //      0c 0d
+    //  Texel layout.  Horizontally adjacent samples (0a/0b, 0c/0d) share a
+    //  column of texels, vertically adjacent samples (0a/0c, 0b/0d) share a
+    //  row of texels, and all four samples share the center texel:
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  SAMPLE OFFSET:
+    //  Weigh the center texel against the texel 1 step away to find how far
+    //  from the center each sample sits (details in tex2Dblur9x9).  The
+    //  center weight is halved because the samples on both sides of it share
+    //  that texel.  The same fraction is used on both axes.
+    float exp_scale = 0.5/(sigma*sigma);
+    float wt_center = 1.0;
+    float wt_step1 = exp(-1.0 * exp_scale);
+    float frac_0to1 = wt_step1/(wt_center * 0.5 + wt_step1);
+    vec2 quad_offset = vec2(frac_0to1, frac_0to1);
+
+    //  TEXTURE FETCHES:
+    //  Flip the step vector's signs to reach the other three quadrants, then
+    //  fetch one sample per quadrant:
+    vec2 step_flip_x = dxdy * vec2(-1.0, 1.0);
+    vec2 step_flip_y = dxdy * vec2(1.0, -1.0);
+    vec2 step_flip_xy = dxdy * vec2(-1.0, -1.0);
+    vec3 tap_0a = tex2D_linearize(tex, tex_uv + step_flip_xy * quad_offset).rgb;
+    vec3 tap_0b = tex2D_linearize(tex, tex_uv + step_flip_y * quad_offset).rgb;
+    vec3 tap_0c = tex2D_linearize(tex, tex_uv + step_flip_x * quad_offset).rgb;
+    vec3 tap_0d = tex2D_linearize(tex, tex_uv + dxdy * quad_offset).rgb;
+
+    //  WEIGHTED SUM:
+    //  All four samples carry the same weight, so a plain average is the
+    //  normalized result:
+    return 0.25 * (tap_0a + tap_0b + tap_0c + tap_0d);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur9fast with blur9_std_dev.
+vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur17fast with blur17_std_dev.
+vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur25fast with blur25_std_dev.
+vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur43fast with blur43_std_dev.
+vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+//  Default-sigma overload: calls the 4-argument tex2Dblur31fast with blur31_std_dev.
+vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur3fast with blur3_std_dev.
+vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur3x3 with blur3_std_dev.
+vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur5fast with blur5_std_dev.
+vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur5resize with blur5_std_dev.
+vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+//  Default-sigma overload: calls the 4-argument tex2Dblur3resize with blur3_std_dev.
+vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur5x5 with blur5_std_dev.
+vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur7resize with blur7_std_dev.
+vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur7fast with blur7_std_dev.
+vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur7x7 with blur7_std_dev.
+vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur9resize with blur9_std_dev.
+vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur9x9 with blur9_std_dev.
+vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur11resize with blur11_std_dev.
+vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+
+//  Default-sigma overload: calls the 4-argument tex2Dblur11fast with blur11_std_dev.
+vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+
+#endif  //  BLUR_FUNCTIONS_H
+
+#define Source source[0]            //  first element of the source[] array
+#define tex_uv vTexCoord.xy         //  xy components of vTexCoord
+//  InputSize and TextureSize expand to the same expression:
+#define InputSize sourceSize[0].xy
+#define TextureSize sourceSize[0].xy
+#define OutputSize targetSize.xy    //  xy components of targetSize
+
+void main()
+{
+    //  Blur Source along blur_dxdy and emit the encoded result with alpha = 1.0:
+    FragColor = encode_output(vec4(tex2Dblur9fast(Source, tex_uv, blur_dxdy), 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/blur9fast-vertical.vs b/shaders/CRT-Royale.shader/blur9fast-vertical.vs
new file mode 100644
index 00000000..8c10ad96
--- /dev/null
+++ b/shaders/CRT-Royale.shader/blur9fast-vertical.vs
@@ -0,0 +1,2025 @@
+#version 150
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+#if __VERSION__ >= 130
+#define COMPAT_TEXTURE texture
+#else   //  __VERSION__ < 130: fall back to texture2D
+#define COMPAT_TEXTURE texture2D
+#endif  //  COMPAT_TEXTURE names the texture lookup function for this version
+//  COMPAT_PRECISION is mediump under GL_ES and expands to nothing otherwise:
+#ifdef GL_ES
+#define COMPAT_PRECISION mediump
+#else
+#define COMPAT_PRECISION
+#endif
+
+in vec4 position;          //  vertex shader input
+in vec2 texCoord;          //  vertex shader input
+//  Interface block written by this stage (the .fs passes blur_dxdy as dxdy):
+out Vertex {
+   vec2 vTexCoord;
+   vec2 blur_dxdy;
+};
+//  Uniforms; sourceSize is declared without an explicit array size:
+uniform vec4 targetSize;
+uniform vec4 sourceSize[];
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  PASS SETTINGS:
+//  gamma-management.h needs to know what kind of pipeline we're using and
+//  what pass this is in that pipeline.  This will become obsolete if/when we
+//  can #define things like this in the .cgp preset file.
+//#define GAMMA_ENCODE_EVERY_FBO
+//#define FIRST_PASS
+//#define LAST_PASS
+//#define SIMULATE_CRT_ON_LCD
+//#define SIMULATE_GBA_ON_LCD
+//#define SIMULATE_LCD_ON_CRT
+//#define SIMULATE_GBA_ON_CRT
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA    //  skipped when the includer overrides
+    //  Standard encoding gammas (plain globals, declared without const):
+    float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?  Also the default intermediate gamma.
+    float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRTs seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default get_crt_gamma()
+    float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    float lcd_reference_gamma = 2.5;       //  To match CRT
+    float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    float lcd_office_gamma = 2.2;  //  Approximates sRGB; default get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    bool assume_opaque_alpha = false;  //  referenced by decode_input() and encode_output()
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    //  Defaults: high CRT reference gamma, a fixed GBA gamma, office LCD gamma.
+    float get_crt_gamma() { return crt_reference_gamma_high; }
+    float get_gba_gamma() { return 3.5; }    //  Game Boy Advance; in (3.0, 4.0)
+    float get_lcd_gamma() { return lcd_office_gamma; }
+#else   //  The includer promises to globally define crt_gamma, gba_gamma, lcd_gamma:
+    float get_crt_gamma() { return crt_gamma; }
+    float get_gba_gamma() { return gba_gamma; }
+    float get_lcd_gamma() { return lcd_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The includer promises to globally define intermediate_gamma,
+    //  input_gamma and output_gamma:
+    float get_intermediate_gamma()  { return intermediate_gamma; }
+    float get_input_gamma()         { return input_gamma; }
+    float get_output_gamma()        { return output_gamma; }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    float get_intermediate_gamma()  { return ntsc_gamma; }
+    //  SIMULATE_<X>_ON_<Y> selects device X's gamma for input and device Y's
+    //  gamma for output.  The macros are tested in the order listed below and
+    //  the first one that is defined wins; any later ones are ignored.  With
+    //  none of them defined, both directions use ntsc_gamma.
+    #if defined(SIMULATE_CRT_ON_LCD)
+        float get_input_gamma()     { return get_crt_gamma(); }
+        float get_output_gamma()    { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        float get_input_gamma()     { return get_gba_gamma(); }
+        float get_output_gamma()    { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        float get_input_gamma()     { return get_lcd_gamma(); }
+        float get_output_gamma()    { return get_crt_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        float get_input_gamma()     { return get_gba_gamma(); }
+        float get_output_gamma()    { return get_crt_gamma(); }
+    #else   //  Don't simulate anything:
+        float get_input_gamma()     { return ntsc_gamma; }
+        float get_output_gamma()    { return ntsc_gamma; }
+    #endif  //  SIMULATE_*
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+
+//  Per-pass input settings, referenced by decode_input().  The first pass
+//  decodes with the input gamma; later passes decode with the intermediate
+//  gamma if GAMMA_ENCODE_EVERY_FBO is defined, and otherwise use an exponent
+//  of 1.0 with linearize_input cleared:
+#if defined(FIRST_PASS)
+    bool linearize_input = true;
+    float get_pass_input_gamma()     { return get_input_gamma(); }
+#elif defined(GAMMA_ENCODE_EVERY_FBO)
+    bool linearize_input = true;
+    float get_pass_input_gamma()     { return get_intermediate_gamma(); }
+#else
+    bool linearize_input = false;
+    float get_pass_input_gamma()     { return 1.0; }
+#endif
+
+//  Per-pass output settings, referenced by encode_output().  The last pass
+//  encodes with the output gamma; earlier passes encode with the
+//  intermediate gamma if GAMMA_ENCODE_EVERY_FBO is defined, and otherwise use
+//  an exponent of 1.0 with gamma_encode_output cleared:
+#if defined(LAST_PASS)
+    bool gamma_encode_output = true;
+    float get_pass_output_gamma()    { return get_output_gamma(); }
+#elif defined(GAMMA_ENCODE_EVERY_FBO)
+    bool gamma_encode_output = true;
+    float get_pass_output_gamma()    { return get_intermediate_gamma(); }
+#else
+    bool gamma_encode_output = false;
+    float get_pass_output_gamma()    { return 1.0; }
+#endif
+
+//  Convert a color to linear light for this pass.
+//  color:   RGBA value to decode (tex2D_linearize passes a texture sample).
+//  Returns: color unchanged when linearize_input is false; otherwise rgb is
+//           raised to get_pass_input_gamma(), and alpha is 1.0 when
+//           assume_opaque_alpha is set or color.a when it is not.
+vec4 decode_input(vec4 color)
+{
+    //  Test the flags instead of assigning to them: "if(flag = true)" wrote
+    //  true into the global and made the branch unconditional.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    vec3 linear_rgb = pow(color.rgb, vec3(get_pass_input_gamma()));
+    //  Optionally ignore the incoming alpha and report full opacity:
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(linear_rgb, alpha);
+}
+
+//  Gamma-encode a linear color for this pass's output.
+//  color:   RGBA value to encode.
+//  Returns: color unchanged when gamma_encode_output is false; otherwise rgb
+//           is raised to 1.0/get_pass_output_gamma(), and alpha is 1.0 when
+//           assume_opaque_alpha is set or color.a when it is not.
+vec4 encode_output(vec4 color)
+{
+    //  Test the flags instead of assigning to them: "if(flag = true)" wrote
+    //  true into the global and made the branch unconditional.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    vec3 encoded_rgb = pow(color.rgb, vec3(1.0/get_pass_output_gamma()));
+    //  Optionally ignore the incoming alpha and report full opacity:
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(encoded_rgb, alpha);
+}
+
+#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+//  Samples C at D through COMPAT_TEXTURE and hands the result to decode_input().
+//  Disabled function form of the same lookup, kept for reference:
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords) {   return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords)));   }
+//  Disabled variants taking a third (texel offset) argument, kept for reference:
+//#define tex2D_linearize(C, D, E) decode_input(vec4(COMPAT_TEXTURE(C, D, E)))
+//vec4 tex2D_linearize(sampler2D tex, vec2 tex_coords, int texel_off) {   return decode_input(vec4(COMPAT_TEXTURE(tex, tex_coords, texel_off)));    }
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  IN.output_size < IN.video_size.
+//              4.) IN.output_size == IN.video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (IN.video_size/IN.output_size)/IN.texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(IN.video_size/IN.output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static float blur3_std_dev
+//                      static float blur4_std_dev
+//                      static float blur5_std_dev
+//                      static float blur6_std_dev
+//                      static float blur7_std_dev
+//                      static float blur8_std_dev
+//                      static float blur9_std_dev
+//                      static float blur10_std_dev
+//                      static float blur11_std_dev
+//                      static float blur12_std_dev
+//                      static float blur17_std_dev
+//                      static float blur25_std_dev
+//                      static float blur31_std_dev
+//                      static float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+#ifndef OVERRIDE_BLUR_STD_DEVS     //  skipped when the includer supplies its own values
+    //  blurN_std_dev values are specified in terms of dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  By request, we can define standard deviations corresponding to a
+        //  binomial distribution with p = 0.5 (related to Pascal's triangle).
+        //  This distribution works such that blurring multiple times should
+        //  have the same result as a single larger blur.  These values are
+        //  larger than default for blurs up to 6x and smaller thereafter.
+        float blur3_std_dev = 0.84931640625;
+        float blur4_std_dev = 0.84931640625;    //  same value as blur3_std_dev
+        float blur5_std_dev = 1.0595703125;
+        float blur6_std_dev = 1.06591796875;
+        float blur7_std_dev = 1.17041015625;
+        float blur8_std_dev = 1.1720703125;
+        float blur9_std_dev = 1.2259765625;
+        float blur10_std_dev = 1.21982421875;
+        float blur11_std_dev = 1.25361328125;
+        float blur12_std_dev = 1.2423828125;
+        float blur17_std_dev = 1.27783203125;
+        float blur25_std_dev = 1.2810546875;
+        float blur31_std_dev = 1.28125;
+        float blur43_std_dev = 1.28125;         //  same value as blur31_std_dev
+    #else
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        float blur3_std_dev = 0.62666015625;
+        float blur4_std_dev = 0.66171875;
+        float blur5_std_dev = 0.9845703125;
+        float blur6_std_dev = 1.02626953125;
+        float blur7_std_dev = 1.36103515625;
+        float blur8_std_dev = 1.4080078125;
+        float blur9_std_dev = 1.7533203125;
+        float blur10_std_dev = 1.80478515625;
+        float blur11_std_dev = 2.15986328125;
+        float blur12_std_dev = 2.215234375;
+        float blur17_std_dev = 3.45535583496;
+        float blur25_std_dev = 5.3409576416;
+        float blur31_std_dev = 6.86488037109;
+        float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
+    //  in shared-sample blurs but increase blurring and feature shifting.
+    float error_blurring = 0.5;    //  default: midpoint of the allowed range
+#endif  //  OVERRIDE_ERROR_BLURRING
+
+//  LENGTH_SQ(vec): squared length via dot(vec, vec), for usage with static constants.  The argument is expanded twice.
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+//#include "quad-pixel-communication.h"
+//#include "special-functions.h"
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (vec4/vec3/vec2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+vec4 erf6(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              This approximation has a max absolute error of 2.5*10**-5
+    //              with solid numerical robustness and efficiency.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    //  The polynomial is evaluated on |x|; sign(x) is applied at the end.
+    vec4 abs_x = abs(x);
+    vec4 t = 1.0/(1.0 + 0.47047*abs_x);
+    vec4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec4 magnitude = vec4(1.0) - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+vec3 erf6(vec3 x)
+{
+    //  vec3 version of the Abramowitz/Stegun erf() approximation.
+    //  The polynomial is evaluated on |x|; sign(x) is applied at the end.
+    vec3 abs_x = abs(x);
+    vec3 t = 1.0/(1.0 + 0.47047*abs_x);
+    vec3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec3 magnitude = vec3(1.0) - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+vec2 erf6(vec2 x)
+{
+    //  vec2 version of the Abramowitz/Stegun erf() approximation.
+    //  The polynomial is evaluated on |x|; sign(x) is applied at the end.
+    vec2 abs_x = abs(x);
+    vec2 t = 1.0/(1.0 + 0.47047*abs_x);
+    vec2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    vec2 magnitude = vec2(1.0) - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float erf6(float x)
+{
+    //  Float version; the polynomial uses |x| and sign(x) is applied last.
+    float abs_x = abs(x);
+    float t = 1.0/(1.0 + 0.47047*abs_x);
+    float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float magnitude = 1.0 - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+vec4 erft(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Approximate erf() with the hyperbolic tangent.  The error is
+    //              visually noticeable, but it's blazing fast and perceptually
+    //              close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Needs a driver with a correct tanh(): an nVidia 8800GTS returned garbage output.
+    float tanh_scale = 1.202760580;
+    return tanh(tanh_scale * x);
+}
+
+vec3 erft(vec3 x)
+{
+    float tanh_scale = 1.202760580;    //  vec3 version of the tanh-based erf()
+    return tanh(tanh_scale * x);
+}
+
+vec2 erft(vec2 x)
+{
+    float tanh_scale = 1.202760580;    //  vec2 version of the tanh-based erf()
+    return tanh(tanh_scale * x);
+}
+
+float erft(float x)
+{
+    float tanh_scale = 1.202760580;    //  float version of the tanh-based erf()
+    return tanh(tanh_scale * x);
+}
+
+vec4 erf(vec4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec3 erf(vec3 x)
+{
+    //  vec3 version: erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+vec2 erf(vec2 x)
+{
+    //  vec2 version: erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+float erf(float x)
+{
+    //  Float version: erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+vec4 gamma_impl(vec4 s, vec4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  This implementation function requires
+    //                  the caller to precompute this value, giving users the
+    //                  opportunity to reuse it.
+    //  Returns:    Return approximate gamma function (real-numbered factorial)
+    //              output using the Lanczos approximation with two coefficients
+    //              calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  We could use three coeffs (0.0000346 error)
+    //              without hurting latency, but this allows more parallelism
+    //              with outside instructions.
+    float lanczos_g = 1.12906830989;
+    float coeff0 = 0.8109119309638332633713423362694399653724431;
+    float coeff1 = 0.4808354605142681877121661197951496120000040;
+    float euler_e = 2.71828182845904523536028747135266249775724709;
+    vec4 s_plus_half = s + vec4(0.5);
+    vec4 series = coeff0 + coeff1/(s + vec4(1.0));
+    vec4 power_base = (s_plus_half + lanczos_g)/euler_e;  //  (s + g + 0.5)/e
+    //  gamma(s + 1) = power_base**s_plus_half * series; divide by s for gamma(s).
+    //  This has less error for small s's than (s -= 1.0) at the beginning.
+    return (pow(power_base, s_plus_half) * series) * s_inv;
+}
+
+vec3 gamma_impl(vec3 s, vec3 s_inv)
+{
+    //  vec3 overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+	vec3 lanczos_g = vec3(1.12906830989);
+	vec3 coeff0 = vec3(0.8109119309638332633713423362694399653724431);
+	vec3 coeff1 = vec3(0.4808354605142681877121661197951496120000040);
+	vec3 euler = vec3(2.71828182845904523536028747135266249775724709);
+	vec3 s_plus_half = s + vec3(0.5);
+	vec3 series = coeff0 + coeff1/(s + vec3(1.0));
+	vec3 power_base = (s_plus_half + lanczos_g)/euler;
+	return (pow(power_base, s_plus_half) * series) * s_inv;
+}
+
+vec2 gamma_impl(vec2 s, vec2 s_inv)
+{
+    //  vec2 overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+	vec2 lanczos_g = vec2(1.12906830989);
+	vec2 coeff0 = vec2(0.8109119309638332633713423362694399653724431);
+	vec2 coeff1 = vec2(0.4808354605142681877121661197951496120000040);
+	vec2 euler = vec2(2.71828182845904523536028747135266249775724709);
+	vec2 s_plus_half = s + vec2(0.5);
+	vec2 series = coeff0 + coeff1/(s + vec2(1.0));
+	vec2 power_base = (s_plus_half + lanczos_g)/euler;
+	return (pow(power_base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(float s, float s_inv)
+{
+    //  Scalar overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+	float lanczos_g = 1.12906830989;
+	float coeff0 = 0.8109119309638332633713423362694399653724431;
+	float coeff1 = 0.4808354605142681877121661197951496120000040;
+	float euler = 2.71828182845904523536028747135266249775724709;
+	float s_plus_half = s + 0.5;
+	float series = coeff0 + coeff1/(s + 1.0);
+	float power_base = (s_plus_half + lanczos_g)/euler;
+	return (pow(power_base, s_plus_half) * series) * s_inv;
+}
+
+vec4 gamma(vec4 s)
+{
+    //  Requires:   s is the gamma function argument; the original notes say
+    //              it should lie in the [0, 36] range.
+    //  Returns:    gamma_impl(s, 1.0/s), the approximate gamma function.
+	vec4 s_inv = vec4(1.0)/s;
+	return gamma_impl(s, s_inv);
+}
+
+vec3 gamma(vec3 s)
+{
+	vec3 s_inv = vec3(1.0)/s;  //  Reciprocal expected by gamma_impl().
+	return gamma_impl(s, s_inv);  //  vec3 version
+}
+
+vec2 gamma(vec2 s)
+{
+	vec2 s_inv = vec2(1.0)/s;  //  Reciprocal expected by gamma_impl().
+	return gamma_impl(s, s_inv);  //  vec2 version
+}
+
+float gamma(float s)
+{
+	float s_inv = 1.0/s;  //  Reciprocal expected by gamma_impl().
+	return gamma_impl(s, s_inv);  //  Float version
+}
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+vec4 ligamma_small_z_impl(vec4 s, vec4 z, vec4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed so callers can reuse it)
+    //  Returns:    The first four terms of a series for the lower incomplete
+    //              gamma function, intended for small s and small z.
+    //  Per component, the value computed is:
+	//      z**s * (  1/s
+	//              - z/(s + 1.0)
+	//              + z**2/(2.0*(s + 2.0))
+	//              - z**3/(6.0*(s + 3.0)) )
+	//  i.e. term k is (-1)**k * z**k / ((s + k) * k!) for k = 0..3.  Each
+	//  denominator below is that product expanded, e.g. 2.0*(s + 2.0) is
+	//  written as 2.0*s + 4.0.  A fifth term (k = 4) is left commented out.
+	//  The terms are applied in order of increasing k:
+	vec4 z_pow_s = pow(z, s);
+	vec4 series = s_inv;  //  k = 0 term
+	//  k = 1, 2, and 3 terms:
+	vec4 z2 = z*z;
+	vec4 term1_denom = s + vec4(1.0);
+	vec4 term2_denom = 2.0*s + vec4(4.0);
+	vec4 term3_denom = 6.0*s + vec4(18.0);
+	//vec4 term4_denom = 24.0*s + vec4(96.0);
+	series -= z/term1_denom;
+	series += z2/term2_denom;
+	series -= z * z2/term3_denom;
+	//series += z2 * z2 / term4_denom;
+	//  Apply the z**s factor and return:
+	return z_pow_s * series;
+}
+
+vec3 ligamma_small_z_impl(vec3 s, vec3 z, vec3 s_inv)
+{
+    //  vec3 overload: z**s times a four-term alternating series in z.
+	vec3 z_pow_s = pow(z, s);
+	vec3 series = s_inv;
+	vec3 z2 = z*z;
+	vec3 term1_denom = s + vec3(1.0);
+	vec3 term2_denom = 2.0*s + vec3(4.0);
+	vec3 term3_denom = 6.0*s + vec3(18.0);
+	series -= z/term1_denom;
+	series += z2/term2_denom;
+	series -= z * z2/term3_denom;
+	return z_pow_s * series;
+}
+
+vec2 ligamma_small_z_impl(vec2 s, vec2 z, vec2 s_inv)
+{
+    //  vec2 overload: z**s times a four-term alternating series in z.
+	vec2 z_pow_s = pow(z, s);
+	vec2 series = s_inv;
+	vec2 z2 = z*z;
+	vec2 term1_denom = s + vec2(1.0);
+	vec2 term2_denom = 2.0*s + vec2(4.0);
+	vec2 term3_denom = 6.0*s + vec2(18.0);
+	series -= z/term1_denom;
+	series += z2/term2_denom;
+	series -= z * z2/term3_denom;
+	return z_pow_s * series;
+}
+
+float ligamma_small_z_impl(float s, float z, float s_inv)
+{
+    //  Scalar overload: z**s times a four-term alternating series in z.
+	float z_pow_s = pow(z, s);
+	float series = s_inv;
+	float z2 = z*z;
+	float term1_denom = s + 1.0;
+	float term2_denom = 2.0*s + 4.0;
+	float term3_denom = 6.0*s + 18.0;
+	series -= z/term1_denom;
+	series += z2/term2_denom;
+	series -= z * z2/term3_denom;
+	return z_pow_s * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+vec4 uigamma_large_z_impl(vec4 s, vec4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    A four-level truncation of Gauss's continued fraction for
+    //              the upper incomplete gamma function, per component.
+	//  Value computed:  z**s * exp(-z) / cf, where cf is built from the
+	//  innermost level outward.  With the innermost level taken as
+	//      cf = 7.0 + z - s
+	//  each enclosing level i = 3, 2, 1 is
+	//      cf = (2.0*i - 1.0) + z - s + i*(s - i)/cf
+	//  and the numerators i*(s - i) are written expanded below, e.g.
+	//  3.0*(s - 3.0) as 3.0*s - 9.0.  The loop is unrolled (the original
+	//  notes cite madds and parallelism as the reason).
+	//  Prefactor shared by the whole fraction:
+	vec4 prefactor = pow(z, s) * exp(-z);
+	vec4 cf = vec4(7.0) + z - s;
+	cf = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/cf;
+	cf = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/cf;
+	cf = vec4(1.0) + z - s + (s - vec4(1.0))/cf;
+	return prefactor / cf;
+}
+
+vec3 uigamma_large_z_impl(vec3 s, vec3 z)
+{
+    //  vec3 overload: z**s * exp(-z) over a 4-level continued fraction.
+	vec3 prefactor = pow(z, s) * exp(-z);
+	vec3 cf = vec3(7.0) + z - s;
+	cf = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/cf;
+	cf = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/cf;
+	cf = vec3(1.0) + z - s + (s - vec3(1.0))/cf;
+	return prefactor / cf;
+}
+
+vec2 uigamma_large_z_impl(vec2 s, vec2 z)
+{
+    //  vec2 overload: z**s * exp(-z) over a 4-level continued fraction.
+	vec2 prefactor = pow(z, s) * exp(-z);
+	vec2 cf = vec2(7.0) + z - s;
+	cf = vec2(5.0) + z - s + (3.0*s - vec2(9.0))/cf;
+	cf = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/cf;
+	cf = vec2(1.0) + z - s + (s - vec2(1.0))/cf;
+	return prefactor / cf;
+}
+
+float uigamma_large_z_impl(float s, float z)
+{
+    //  Scalar overload: z**s * exp(-z) over a 4-level continued fraction.
+	float prefactor = pow(z, s) * exp(-z);
+	float cf = 7.0 + z - s;
+	cf = 5.0 + z - s + (3.0*s - 9.0)/cf;
+	cf = 3.0 + z - s + (2.0*s - 4.0)/cf;
+	cf = 1.0 + z - s + (s - 1.0)/cf;
+	return prefactor / cf;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+vec4 normalized_ligamma_impl(vec4 s, vec4 z,
+    vec4 s_inv, vec4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5, per component.  Only two branches (not four)
+    //              are needed for s < 0.5; z selects between them.  Each
+    //              branch uses four terms, with a max relative error of
+    //              ~0.00182.  The branch threshold and specifics were adapted
+    //              for fewer terms from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+	//  Evaluate both branches: Real branches test slower even when available.
+	vec4 thresh = vec4(0.775075);
+	bvec4 z_is_large = greaterThan(z , thresh);
+	vec4 z_size_check = vec4(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0, z_is_large.w ? 1.0 : 0.0);
+	vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	//  Blend with complementary masks: large_z where z > thresh, else small_z.
+	return large_z * z_size_check + small_z * (vec4(1.0) - z_size_check);
+}
+
+vec3 normalized_ligamma_impl(vec3 s, vec3 z,
+    vec3 s_inv, vec3 gamma_s_inv)
+{
+    //  vec3 version.  large_z is kept where z > thresh, small_z elsewhere.
+	vec3 thresh = vec3(0.775075);
+	bvec3 z_is_large = greaterThan(z , thresh);
+	vec3 z_size_check = vec3(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0);
+	vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	return large_z * z_size_check + small_z * (vec3(1.0) - z_size_check);
+}
+
+vec2 normalized_ligamma_impl(vec2 s, vec2 z,
+    vec2 s_inv, vec2 gamma_s_inv)
+{
+    //  vec2 version.  large_z is kept where z > thresh, small_z elsewhere.
+	vec2 thresh = vec2(0.775075);
+	bvec2 z_is_large = greaterThan(z , thresh);
+	vec2 z_size_check = vec2(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0);
+	vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	return large_z * z_size_check + small_z * (vec2(1.0) - z_size_check);
+}
+
+float normalized_ligamma_impl(float s, float z,
+    float s_inv, float gamma_s_inv)
+{
+    //  Float version.  large_z is kept where z > thresh, small_z elsewhere.
+	float thresh = 0.775075;
+	float z_size_check = 0.0;
+	if (z > thresh) z_size_check = 1.0;
+	float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	return large_z * z_size_check + small_z * (1.0 - z_size_check);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+vec4 normalized_ligamma(vec4 s, vec4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    The result of normalized_ligamma_impl() for (s, z), after
+    //              computing the 1/s and 1/gamma(s) inputs it expects.
+	vec4 recip_s = vec4(1.0)/s;
+	vec4 recip_gamma_s = vec4(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+vec3 normalized_ligamma(vec3 s, vec3 z)
+{
+    //  vec3 overload: derive 1/s and 1/gamma(s), then defer to the impl.
+	vec3 recip_s = vec3(1.0)/s;
+	vec3 recip_gamma_s = vec3(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+vec2 normalized_ligamma(vec2 s, vec2 z)
+{
+    //  vec2 overload: derive 1/s and 1/gamma(s), then defer to the impl.
+	vec2 recip_s = vec2(1.0)/s;
+	vec2 recip_gamma_s = vec2(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(float s, float z)
+{
+    //  Scalar overload: derive 1/s and 1/gamma(s), then defer to the impl.
+	float recip_s = 1.0/s;
+	float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+vec4 uv2_to_uv4(vec2 tex_uv)
+{
+    //  Widen a vec2 uv offset to vec4 (z = w = 0.0) for tex2Dlod-style coords:
+    return vec4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Squared length as a macro, dot(vec, vec), for usage with static constants:
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+float get_fast_gaussian_weight_sum_inv(float sigma)
+{
+    //  Returns an approximate inverse weight sum for a Gaussian blur whose
+    //  unnormalized center weight is 1.0 (so the normalized center weight
+    //  equals the inverse weight sum).
+    //  Background from the original notes: for a large enough blur (9+) the
+    //  asymptotic center weight follows from the Gaussian integral,
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //  which reduces to erf(0.5/(sigma*sqrt(2.0))) since erf(-x) == -erf(x).
+    //  The curve fit below was preferred: it was constructed from 64 blur
+    //  sizes from [3, 131) and 255 equally-spaced sigmas from
+    //  (0, blurN_std_dev), so smaller sigmas are biased toward smaller blurs.
+    //  Reported max error: 0.0031793913.
+    //  Reported relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    //  erf alternative:  return erf((0.5/sqrt(2.0))/sigma);
+    float curve_fit = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    float asymptote = 0.399334576340352/sigma;
+    return min(curve_fit, asymptote);
+}
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    An 11-tap 1D Gaussian blur of tex around tex_uv, stepping
+    //              by dxdy per tap.  May be mipmapped depending on settings.
+    //  The unnormalized weight at distance d is exp(-d*d/(2*sigma*sigma)) for
+    //  d = 0..5; constant factors drop out when normalizing by the sum.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float norm = 1.0 /
+        (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Accumulate the weighted taps from one end of the kernel to the other,
+    //  then normalize once at the end (weights are computed dynamically).
+    vec3 acc = vec3(0.0);
+    acc += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    acc += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    acc += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    acc += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    acc += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    acc += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    acc += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    acc += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    acc += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 9-tap 1D Gaussian blur of tex around tex_uv, stepping by
+    //              dxdy per tap.  May be mipmapped depending on settings.
+    //  Weights are exp(-d*d/(2*sigma*sigma)) for d = 0..4, normalized by sum.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float norm = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+    //  Accumulate the weighted taps, then normalize once at the end:
+    vec3 acc = vec3(0.0);
+    acc += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    acc += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    acc += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    acc += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    acc += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    acc += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    acc += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 7-tap 1D Gaussian blur of tex around tex_uv, stepping by
+    //              dxdy per tap.  May be mipmapped depending on settings.
+    //  Weights are exp(-d*d/(2*sigma*sigma)) for d = 0..3, normalized by sum.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float norm = 1.0 / (w0 + 2.0 * (w1 + w2 + w3));
+    //  Accumulate the weighted taps, then normalize once at the end:
+    vec3 acc = vec3(0.0);
+    acc += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    acc += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    acc += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    acc += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    acc += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 5-tap 1D Gaussian blur of tex around tex_uv, stepping by
+    //              dxdy per tap.  May be mipmapped depending on settings.
+    //  Weights are exp(-d*d/(2*sigma*sigma)) for d = 0..2, normalized by sum.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float norm = 1.0 / (w0 + 2.0 * (w1 + w2));
+    //  Accumulate the weighted taps, then normalize once at the end:
+    vec3 acc = vec3(0.0);
+    acc += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    acc += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    acc += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3-tap 1D Gaussian blur of tex around tex_uv, stepping by
+    //              dxdy per tap.  May be mipmapped depending on settings.
+    //  Weights are exp(-d*d/(2*sigma*sigma)) for d = 0..1, normalized by sum.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-inv_two_var);
+    float norm = 1.0 / (w0 + 2.0 * w1);
+    //  Accumulate the weighted taps, then normalize once at the end:
+    vec3 acc = vec3(0.0);
+    acc += w1 * tex2D_linearize(tex, tex_uv - dxdy).rgb;
+    acc += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += w1 * tex2D_linearize(tex, tex_uv + dxdy).rgb;
+    return acc * norm;
+}
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    An 11-texel-wide 1D Gaussian blur taken with 6 linear
+    //              taps.  May be mipmapped depending on settings and dxdy.
+    //  Per-texel weights are exp(-d*d/(2*sigma*sigma)) for d = 0..5.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float norm = 1.0 /
+        (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Pair adjacent texels: each pair gets the summed weight and is sampled
+    //  between them.  The center texel feeds two pairs, so it counts half.
+    float pair01 = w0 * 0.5 + w1;
+    float pair23 = w2 + w3;
+    float pair45 = w4 + w5;
+    float frac01 = w1/pair01;
+    float frac23 = w3/pair23;
+    float frac45 = w5/pair45;
+    //  Accumulate the weighted taps, then normalize once at the end:
+    vec3 acc = vec3(0.0);
+    acc += pair45 * tex2D_linearize(tex, tex_uv - (4.0 + frac45) * dxdy).rgb;
+    acc += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + frac23) * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv - frac01 * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv + frac01 * dxdy).rgb;
+    acc += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + frac23) * dxdy).rgb;
+    acc += pair45 * tex2D_linearize(tex, tex_uv + (4.0 + frac45) * dxdy).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 17-texel-wide 1D Gaussian blur taken with 1 center tap
+    //              (nearest neighbor) and 8 linear taps.  May be mipmapped
+    //              depending on settings and dxdy.
+    //  Per-texel weights are exp(-d*d/(2*sigma*sigma)) for d = 0..8.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float w6 = exp(-36.0 * inv_two_var);
+    float w7 = exp(-49.0 * inv_two_var);
+    float w8 = exp(-64.0 * inv_two_var);
+    //  Exact alternative, left disabled:
+    //  norm = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Pair adjacent texels: summed weight, sampled between the two texels.
+    float pair1_2 = w1 + w2;
+    float pair3_4 = w3 + w4;
+    float pair5_6 = w5 + w6;
+    float pair7_8 = w7 + w8;
+    float frac1_2 = w2/pair1_2;
+    float frac3_4 = w4/pair3_4;
+    float frac5_6 = w6/pair5_6;
+    float frac7_8 = w8/pair7_8;
+    //  Accumulate the weighted taps, then normalize once at the end:
+    vec3 acc = vec3(0.0);
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv - (7.0 + frac7_8) * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv - (5.0 + frac5_6) * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv - (3.0 + frac3_4) * dxdy).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv - (1.0 + frac1_2) * dxdy).rgb;
+    acc += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv + (1.0 + frac1_2) * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv + (3.0 + frac3_4) * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv + (5.0 + frac5_6) * dxdy).rgb;
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv + (7.0 + frac7_8) * dxdy).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 25-texel-wide 1D Gaussian blur taken with 1 center tap
+    //              (nearest neighbor) and 12 linear taps.  May be mipmapped
+    //              depending on settings and dxdy.
+    //  Per-texel weights are exp(-d*d/(2*sigma*sigma)) for d = 0..12.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float w6 = exp(-36.0 * inv_two_var);
+    float w7 = exp(-49.0 * inv_two_var);
+    float w8 = exp(-64.0 * inv_two_var);
+    float w9 = exp(-81.0 * inv_two_var);
+    float w10 = exp(-100.0 * inv_two_var);
+    float w11 = exp(-121.0 * inv_two_var);
+    float w12 = exp(-144.0 * inv_two_var);
+    //  Exact alternative, left disabled:  norm = 1.0 / (w0 + 2.0 * (
+    //      w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Pair adjacent texels: summed weight, sampled between the two texels.
+    float pair1_2 = w1 + w2;
+    float pair3_4 = w3 + w4;
+    float pair5_6 = w5 + w6;
+    float pair7_8 = w7 + w8;
+    float pair9_10 = w9 + w10;
+    float pair11_12 = w11 + w12;
+    float frac1_2 = w2/pair1_2;
+    float frac3_4 = w4/pair3_4;
+    float frac5_6 = w6/pair5_6;
+    float frac7_8 = w8/pair7_8;
+    float frac9_10 = w10/pair9_10;
+    float frac11_12 = w12/pair11_12;
+    //  Accumulate the weighted taps, then normalize once at the end:
+    vec3 acc = vec3(0.0);
+    acc += pair11_12 * tex2D_linearize(tex, tex_uv - (11.0 + frac11_12) * dxdy).rgb;
+    acc += pair9_10 * tex2D_linearize(tex, tex_uv - (9.0 + frac9_10) * dxdy).rgb;
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv - (7.0 + frac7_8) * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv - (5.0 + frac5_6) * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv - (3.0 + frac3_4) * dxdy).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv - (1.0 + frac1_2) * dxdy).rgb;
+    acc += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv + (1.0 + frac1_2) * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv + (3.0 + frac3_4) * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv + (5.0 + frac5_6) * dxdy).rgb;
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv + (7.0 + frac7_8) * dxdy).rgb;
+    acc += pair9_10 * tex2D_linearize(tex, tex_uv + (9.0 + frac9_10) * dxdy).rgb;
+    acc += pair11_12 * tex2D_linearize(tex, tex_uv + (11.0 + frac11_12) * dxdy).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 31-texel-wide 1D Gaussian blur taken with 16 linear
+    //              taps.  May be mipmapped depending on settings and dxdy.
+    //  Per-texel weights are exp(-d*d/(2*sigma*sigma)) for d = 0..15.
+    float inv_two_var = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-inv_two_var);
+    float w2 = exp(-4.0 * inv_two_var);
+    float w3 = exp(-9.0 * inv_two_var);
+    float w4 = exp(-16.0 * inv_two_var);
+    float w5 = exp(-25.0 * inv_two_var);
+    float w6 = exp(-36.0 * inv_two_var);
+    float w7 = exp(-49.0 * inv_two_var);
+    float w8 = exp(-64.0 * inv_two_var);
+    float w9 = exp(-81.0 * inv_two_var);
+    float w10 = exp(-100.0 * inv_two_var);
+    float w11 = exp(-121.0 * inv_two_var);
+    float w12 = exp(-144.0 * inv_two_var);
+    float w13 = exp(-169.0 * inv_two_var);
+    float w14 = exp(-196.0 * inv_two_var);
+    float w15 = exp(-225.0 * inv_two_var);
+    //  Exact alternative, left disabled:  norm = 1.0 /
+    //      (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 +
+    //          w9 + w10 + w11 + w12 + w13 + w14 + w15));
+    float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Pair adjacent texels: each pair gets the summed weight and is sampled
+    //  between them.  The center texel feeds two pairs, so it counts half.
+    float pair0_1 = w0 * 0.5 + w1;
+    float pair2_3 = w2 + w3;
+    float pair4_5 = w4 + w5;
+    float pair6_7 = w6 + w7;
+    float pair8_9 = w8 + w9;
+    float pair10_11 = w10 + w11;
+    float pair12_13 = w12 + w13;
+    float pair14_15 = w14 + w15;
+    float frac0_1 = w1/pair0_1;
+    float frac2_3 = w3/pair2_3;
+    float frac4_5 = w5/pair4_5;
+    float frac6_7 = w7/pair6_7;
+    float frac8_9 = w9/pair8_9;
+    float frac10_11 = w11/pair10_11;
+    float frac12_13 = w13/pair12_13;
+    float frac14_15 = w15/pair14_15;
+    //  Accumulate the weighted taps, then normalize once at the end:
+    vec3 acc = vec3(0.0);
+    acc += pair14_15 * tex2D_linearize(tex, tex_uv - (14.0 + frac14_15) * dxdy).rgb;
+    acc += pair12_13 * tex2D_linearize(tex, tex_uv - (12.0 + frac12_13) * dxdy).rgb;
+    acc += pair10_11 * tex2D_linearize(tex, tex_uv - (10.0 + frac10_11) * dxdy).rgb;
+    acc += pair8_9 * tex2D_linearize(tex, tex_uv - (8.0 + frac8_9) * dxdy).rgb;
+    acc += pair6_7 * tex2D_linearize(tex, tex_uv - (6.0 + frac6_7) * dxdy).rgb;
+    acc += pair4_5 * tex2D_linearize(tex, tex_uv - (4.0 + frac4_5) * dxdy).rgb;
+    acc += pair2_3 * tex2D_linearize(tex, tex_uv - (2.0 + frac2_3) * dxdy).rgb;
+    acc += pair0_1 * tex2D_linearize(tex, tex_uv - frac0_1 * dxdy).rgb;
+    acc += pair0_1 * tex2D_linearize(tex, tex_uv + frac0_1 * dxdy).rgb;
+    acc += pair2_3 * tex2D_linearize(tex, tex_uv + (2.0 + frac2_3) * dxdy).rgb;
+    acc += pair4_5 * tex2D_linearize(tex, tex_uv + (4.0 + frac4_5) * dxdy).rgb;
+    acc += pair6_7 * tex2D_linearize(tex, tex_uv + (6.0 + frac6_7) * dxdy).rgb;
+    acc += pair8_9 * tex2D_linearize(tex, tex_uv + (8.0 + frac8_9) * dxdy).rgb;
+    acc += pair10_11 * tex2D_linearize(tex, tex_uv + (10.0 + frac10_11) * dxdy).rgb;
+    acc += pair12_13 * tex2D_linearize(tex, tex_uv + (12.0 + frac12_13) * dxdy).rgb;
+    acc += pair14_15 * tex2D_linearize(tex, tex_uv + (14.0 + frac14_15) * dxdy).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian texel weights: wN = exp(-N*N / (2*sigma*sigma)).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    float w2 = exp(-4.0 * denom_inv);
+    float w3 = exp(-9.0 * denom_inv);
+    float w4 = exp(-16.0 * denom_inv);
+    float w5 = exp(-25.0 * denom_inv);
+    float w6 = exp(-36.0 * denom_inv);
+    float w7 = exp(-49.0 * denom_inv);
+    float w8 = exp(-64.0 * denom_inv);
+    float w9 = exp(-81.0 * denom_inv);
+    float w10 = exp(-100.0 * denom_inv);
+    float w11 = exp(-121.0 * denom_inv);
+    float w12 = exp(-144.0 * denom_inv);
+    float w13 = exp(-169.0 * denom_inv);
+    float w14 = exp(-196.0 * denom_inv);
+    float w15 = exp(-225.0 * denom_inv);
+    float w16 = exp(-256.0 * denom_inv);
+    float w17 = exp(-289.0 * denom_inv);
+    float w18 = exp(-324.0 * denom_inv);
+    float w19 = exp(-361.0 * denom_inv);
+    float w20 = exp(-400.0 * denom_inv);
+    float w21 = exp(-441.0 * denom_inv);
+    //  The explicit normalization factor would be
+    //      1.0 / (w0 + 2.0 * (w1 + w2 + ... + w20 + w21)),
+    //  but it is obtained from a helper (defined elsewhere) given sigma:
+    float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Pair up texels: each linear tap carries its pair's summed weight and is
+    //  offset by the far texel's share.  w0 is split between both inner taps.
+    float w0_1 = w0 * 0.5 + w1;
+    float w2_3 = w2 + w3;
+    float w4_5 = w4 + w5;
+    float w6_7 = w6 + w7;
+    float w8_9 = w8 + w9;
+    float w10_11 = w10 + w11;
+    float w12_13 = w12 + w13;
+    float w14_15 = w14 + w15;
+    float w16_17 = w16 + w17;
+    float w18_19 = w18 + w19;
+    float w20_21 = w20 + w21;
+    float w0_1_ratio = w1/w0_1;
+    float w2_3_ratio = w3/w2_3;
+    float w4_5_ratio = w5/w4_5;
+    float w6_7_ratio = w7/w6_7;
+    float w8_9_ratio = w9/w8_9;
+    float w10_11_ratio = w11/w10_11;
+    float w12_13_ratio = w13/w12_13;
+    float w14_15_ratio = w15/w14_15;
+    float w16_17_ratio = w17/w16_17;
+    float w18_19_ratio = w19/w18_19;
+    float w20_21_ratio = w21/w20_21;
+    //  Sum the 22 weighted taps (-dxdy side first), then normalize the total:
+    vec3 sum = vec3(0.0);
+    sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for the center texel and its neighbors:
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0 = 1.0;
+    float w1 = exp(-1.0 * denom_inv);
+    //  Each tap blends the center texel with one neighbor.  Both taps read the
+    //  center texel, so each one carries only half of the center weight:
+    float w01 = w0 * 0.5 + w1;
+    float w01_ratio = w1/w01;
+    //  The normalization factor is 1.0/(w0 + 2.0*w1) == 1.0/(2.0*w01), so each
+    //  normalized tap weight is exactly 0.5 and a plain average suffices:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup built from one
+    //              center tap plus one linear tap per side.  It may be
+    //              mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texels 0, 1, and 2 steps away:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float wt_center = 1.0;
+    float wt_near = exp(-1.0 * inv_two_var);
+    float wt_far = exp(-4.0 * inv_two_var);
+    float norm = 1.0 / (wt_center + 2.0 * (wt_near + wt_far));
+    //  Each side tap blends texels 1 and 2; shift it toward 2 by its share.
+    float wt_side = wt_near + wt_far;
+    vec2 side_step = (1.0 + wt_far/wt_side) * dxdy;
+    //  Accumulate the -dxdy tap, the center, and the +dxdy tap; then normalize:
+    vec3 acc = vec3(0.0);
+    acc += wt_side * tex2D_linearize(tex, tex_uv - side_step).rgb;
+    acc += wt_center * tex2D_linearize(tex, tex_uv).rgb;
+    acc += wt_side * tex2D_linearize(tex, tex_uv + side_step).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 7x Gaussian blurred lookup from 4 linear taps (2 per
+    //              side).  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texels 0..3 steps from the center:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float wt0 = 1.0;
+    float wt1 = exp(-1.0 * inv_two_var);
+    float wt2 = exp(-4.0 * inv_two_var);
+    float wt3 = exp(-9.0 * inv_two_var);
+    float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3));
+    //  Pair texels (0,1) and (2,3).  Both inner taps read the center texel,
+    //  so each inner pair carries only half of the center weight.
+    float pair_in = wt0 * 0.5 + wt1;
+    float pair_out = wt2 + wt3;
+    vec2 step_in = (wt1/pair_in) * dxdy;
+    vec2 step_out = (2.0 + wt3/pair_out) * dxdy;
+    //  Accumulate the taps from the -dxdy end to the +dxdy end; then normalize:
+    vec3 acc = vec3(0.0);
+    acc += pair_out * tex2D_linearize(tex, tex_uv - step_out).rgb;
+    acc += pair_in * tex2D_linearize(tex, tex_uv - step_in).rgb;
+    acc += pair_in * tex2D_linearize(tex, tex_uv + step_in).rgb;
+    acc += pair_out * tex2D_linearize(tex, tex_uv + step_out).rgb;
+    return acc * norm;
+}
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input.
+    //  Description:
+    //  This is the only arbitrarily resizable one-pass blur.  It reads the
+    //  full 3x3 neighborhood with one tap per texel (9 lookups), weights each
+    //  tap by the Gaussian of its squared texel distance, and normalizes.
+    //  Row/column steps in uv space:
+    vec2 step_x = vec2(dxdy.x, 0.0);
+    vec2 step_y = vec2(0.0, dxdy.y);
+    vec2 row0_uv = tex_uv - step_y;
+    vec2 row1_uv = tex_uv;
+    vec2 row2_uv = tex_uv + step_y;
+    //  Fetch the grid row by row (sRC = row R, column C; s11 is the center):
+    vec3 s00 = tex2D_linearize(tex, row0_uv - step_x).rgb;
+    vec3 s01 = tex2D_linearize(tex, row0_uv).rgb;
+    vec3 s02 = tex2D_linearize(tex, row0_uv + step_x).rgb;
+    vec3 s10 = tex2D_linearize(tex, row1_uv - step_x).rgb;
+    vec3 s11 = tex2D_linearize(tex, row1_uv).rgb;
+    vec3 s12 = tex2D_linearize(tex, row1_uv + step_x).rgb;
+    vec3 s20 = tex2D_linearize(tex, row2_uv - step_x).rgb;
+    vec3 s21 = tex2D_linearize(tex, row2_uv).rgb;
+    vec3 s22 = tex2D_linearize(tex, row2_uv + step_x).rgb;
+    //  Gaussian weights by squared texel distance (center, edge, corner):
+    float inv_two_var = 0.5/(sigma*sigma);
+    float wt_center = 1.0;
+    float wt_edge = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * inv_two_var);
+    float wt_corner = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * inv_two_var);
+    float norm = 1.0/(wt_center + 4.0 * (wt_edge + wt_corner));
+    //  Combine the center, the four edge taps, and the four corner taps:
+    vec3 edge_sum = s01 + s10 + s12 + s21;
+    vec3 corner_sum = s00 + s02 + s20 + s22;
+    vec3 acc = wt_center * s11 + wt_edge * edge_sum + wt_corner * corner_sum;
+    return acc * norm;
+}
+
+//  Resizable one-pass blurs:
+//  Convenience overload: forwards to the 4-argument form with blur3_std_dev.
+vec3 tex2Dblur3x3resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup built from one
+    //              center tap plus two linear taps per side.  It may be
+    //              mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texels 0..4 steps from the center:
+    float inv_two_var = 0.5/(sigma*sigma);
+    float wt0 = 1.0;
+    float wt1 = exp(-1.0 * inv_two_var);
+    float wt2 = exp(-4.0 * inv_two_var);
+    float wt3 = exp(-9.0 * inv_two_var);
+    float wt4 = exp(-16.0 * inv_two_var);
+    float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4));
+    //  Pair texels (1,2) and (3,4); shift each tap by the far texel's share.
+    float pair_in = wt1 + wt2;
+    float pair_out = wt3 + wt4;
+    vec2 step_in = (1.0 + wt2/pair_in) * dxdy;
+    vec2 step_out = (3.0 + wt4/pair_out) * dxdy;
+    //  Accumulate the taps from the -dxdy end to the +dxdy end; then normalize:
+    vec3 acc = vec3(0.0);
+    acc += pair_out * tex2D_linearize(tex, tex_uv - step_out).rgb;
+    acc += pair_in * tex2D_linearize(tex, tex_uv - step_in).rgb;
+    acc += wt0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair_in * tex2D_linearize(tex, tex_uv + step_in).rgb;
+    acc += pair_out * tex2D_linearize(tex, tex_uv + step_out).rgb;
+    return acc * norm;
+}
+
+vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Each bilinear sample covers up to 2x2 underlying texels.  Adjust the
+    //  bilinear sample location to reflect the true Gaussian weights for each
+    //  underlying texel.  The following diagram illustrates the relative
+    //  locations of bilinear samples.  Each sample with the same number has the
+    //  same weight (notice the symmetry).  The letters a, b, c, d distinguish
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w1off = exp(-1.0 * denom_inv);
+    float w2off = exp(-4.0 * denom_inv);
+    float w3off = exp(-9.0 * denom_inv);
+    float w4off = exp(-16.0 * denom_inv);
+    float texel1to2ratio = w2off/(w1off + w2off);
+    float texel3to4ratio = w4off/(w3off + w4off);
+    //  Texel offsets from the fragment center to the +x axis samples and the
+    //  (+x, +y) quadrant samples; the rest are mirrored/swizzled from these:
+    vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0);
+    vec2 sample2R_texel_offset = vec2(3.0, 0.0) + vec2(texel3to4ratio, 0.0);
+    vec2 sample3d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio);
+    vec2 sample4d_texel_offset = vec2(3.0, 1.0) + vec2(texel3to4ratio, texel1to2ratio);
+    vec2 sample5d_texel_offset = vec2(1.0, 3.0) + vec2(texel1to2ratio, texel3to4ratio);
+    vec2 sample6d_texel_offset = vec2(3.0, 3.0) + vec2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    float w1R1 = w1off;
+    float w1R2 = w2off;
+    float w2R1 = w3off;
+    float w2R2 = w4off;
+    float w3d1 =     exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    float w3d2_3d3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
+    float w3d4 =     exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
+    float w4d1_5d1 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv);
+    float w4d2_5d3 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * denom_inv);
+    float w4d3_5d2 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv);
+    float w4d4_5d4 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * denom_inv);
+    float w6d1 =     exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv);
+    float w6d2_6d3 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * denom_inv);
+    float w6d4 =     exp(-LENGTH_SQ(vec2(4.0, 4.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    float w0 = 1.0;
+    float w1 = w1R1 + w1R2;
+    float w2 = w2R1 + w2R2;
+    float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    float w5 = w4;  //  sample 5 is sample 4 with x and y swapped (same weights)
+    float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse (normalization factor):
+    float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);
+    vec2 mirror_y = vec2(1.0, -1.0);
+    vec2 mirror_xy = vec2(-1.0, -1.0);
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear:
+    vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    vec3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    vec3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    vec3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    vec3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    vec3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    vec3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    vec3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    vec3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    vec3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    vec3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    vec3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    vec3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by weight_sum_inv (total = 1.0).
+    vec3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
+    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0off = 1.0;
+    float w1off = exp(-1.0 * denom_inv);
+    float w2off = exp(-4.0 * denom_inv);
+    float w3off = exp(-9.0 * denom_inv);
+    float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
+    float texel2to3ratio = w3off/(w2off + w3off);
+    //  Texel offsets from the fragment center to each bilinear sample in the
+    //  bottom-right quadrant; the other quadrants are mirrored from these:
+    vec2 sample1d_texel_offset = vec2(texel0to1ratio, texel0to1ratio);
+    vec2 sample2d_texel_offset = vec2(2.0, 0.0) + vec2(texel2to3ratio, texel0to1ratio);
+    vec2 sample3d_texel_offset = vec2(0.0, 2.0) + vec2(texel0to1ratio, texel2to3ratio);
+    vec2 sample4d_texel_offset = vec2(2.0, 2.0) + vec2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    float w1abcd = 1.0;
+    float w1bd2_1cd3 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv);
+    float w2bd1_3cd1 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * denom_inv);
+    float w2bd2_3cd2 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * denom_inv);
+    float w1d4 =       exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    float w2d3_3d2 =   exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
+    float w2d4_3d4 =   exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv);
+    float w4d1 =       exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
+    float w4d2_4d3 =   exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv);
+    float w4d4 =       exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights.
+    //  Split weights for shared texels between samples sharing them:
+    float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;
+    float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;
+    float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;
+    //  Get the weight sum inverse (normalization factor):
+    float weight_sum_inv =
+        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);
+    vec2 mirror_y = vec2(1.0, -1.0);
+    vec2 mirror_xy = vec2(-1.0, -1.0);
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    vec3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    vec3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    vec3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    vec3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by weight_sum_inv (total = 1.0).
+    vec3 sum = vec3(0.0);
+    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
+    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
+    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
+    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  First see the description for tex2Dblur9x9().  This blur uses the same
+    //  concept and sample/texel locations except on a smaller scale.  Samples:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w1off = exp(-1.0 * denom_inv);
+    float w2off = exp(-4.0 * denom_inv);
+    float texel1to2ratio = w2off/(w1off + w2off);
+    //  Texel offsets from the fragment center to the +x axis sample and the
+    //  (+x, +y) quadrant sample; the rest are mirrored/swizzled from these:
+    vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0);
+    vec2 sample2d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    float w1R1 = w1off;
+    float w1R2 = w2off;
+    float w2d1 =   exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    float w2d2_3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
+    float w2d4 =   exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    float w0 = 1.0;
+    float w1 = w1R1 + w1R2;
+    float w2 = w2d1 + 2.0 * w2d2_3 + w2d4;
+    //  Get the weight sum inverse (normalization factor):
+    float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);
+    vec2 mirror_y = vec2(1.0, -1.0);
+    vec2 mirror_xy = vec2(-1.0, -1.0);
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by weight_sum_inv (total = 1.0).
+    vec3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2a + sample2b + sample2c + sample2d);
+    return sum * weight_sum_inv;
+}
+
+vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv,
+    vec2 dxdy, float sigma)
+{
+    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed of
+    //              2x2 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      0a 0b
+    //      0c 0d
+    //  The texel layout is as follows.  Note that samples 0a/0b and 0c/0d share
+    //  a vertical column of texels, and samples 0a/0c and 0b/0d share a
+    //  horizontal row of texels (all samples share the center texel):
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    float denom_inv = 0.5/(sigma*sigma);
+    float w0off = 1.0;
+    float w1off = exp(-1.0 * denom_inv);
+    float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
+    //  Texel offset from the fragment center to the single bilinear sample in
+    //  the bottom-right quadrant; the other three are mirrored from it:
+    vec2 sample0d_texel_offset = vec2(texel0to1ratio, texel0to1ratio);
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 4 samples using symmetry:
+    vec2 mirror_x = vec2(-1.0, 1.0);
+    vec2 mirror_y = vec2(1.0, -1.0);
+    vec2 mirror_xy = vec2(-1.0, -1.0);
+    vec2 dxdy_mirror_x = dxdy * mirror_x;
+    vec2 dxdy_mirror_y = dxdy * mirror_y;
+    vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    vec3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb;
+    vec3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb;
+    vec3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb;
+    vec3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  The four samples are mirror images with equal weight, so average them:
+    return 0.25 * (sample0a + sample0b + sample0c + sample0d);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur9_std_dev.
+vec3 tex2Dblur9fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur17_std_dev.
+vec3 tex2Dblur17fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur25_std_dev.
+vec3 tex2Dblur25fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur43_std_dev.
+vec3 tex2Dblur43fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+//  Convenience overload: forwards to the 4-argument form with blur31_std_dev.
+vec3 tex2Dblur31fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur3_std_dev.
+vec3 tex2Dblur3fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur3_std_dev.
+vec3 tex2Dblur3x3(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur5_std_dev.
+vec3 tex2Dblur5fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur5_std_dev.
+vec3 tex2Dblur5resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+//  Convenience overload: forwards to the 4-argument form with blur3_std_dev.
+vec3 tex2Dblur3resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur5_std_dev.
+vec3 tex2Dblur5x5(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur7_std_dev.
+vec3 tex2Dblur7resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur7_std_dev.
+vec3 tex2Dblur7fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur7_std_dev.
+vec3 tex2Dblur7x7(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur9_std_dev.
+vec3 tex2Dblur9resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur9_std_dev.
+vec3 tex2Dblur9x9(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur11_std_dev.
+vec3 tex2Dblur11resize(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+
+//  Convenience overload: forwards to the 4-argument form with blur11_std_dev.
+vec3 tex2Dblur11fast(sampler2D tex, vec2 tex_uv, vec2 dxdy)
+{
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+
+#endif  //  BLUR_FUNCTIONS_H
+
+#define InputSize sourceSize[0].xy  //  same value as TextureSize in this pass
+#define TextureSize sourceSize[0].xy  //  both names alias sourceSize[0].xy
+#define OutputSize targetSize.xy  //  xy components of the targetSize uniform
+
+void main() {
+    gl_Position = position;
+    vTexCoord = texCoord;
+    //  Derive the uv distance between adjacent output pixels.  The blurs are
+    //  not general-purpose Gaussian resizers; they are only correct when:
+    //  1.) OutputSize == InputSize * 2^m, where m is an integer <= 0.
+    //  2.) mipmap_inputN = "true" for this pass in the preset if m != 0
+    //  3.) filter_linearN = "true" except for 1x scale nearest neighbor blurs
+    //  A true Gaussian resizer would step by input texels when upsizing, but
+    //  this pass always steps by output pixels and blurs at the destination
+    //  size: statically computed weights combined with bilinear-tap tricks
+    //  would otherwise produce terrible artifacts.
+    vec2 input_per_output = InputSize/OutputSize;
+    vec2 uv_step = input_per_output/TextureSize;
+    //  Vertical-only pass: keep the y step and force the x step to zero.
+    blur_dxdy = vec2(0.0, uv_step.y);
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/brightpass.fs b/shaders/CRT-Royale.shader/brightpass.fs
new file mode 100644
index 00000000..29f27db8
--- /dev/null
+++ b/shaders/CRT-Royale.shader/brightpass.fs
@@ -0,0 +1,14481 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+uniform sampler2D source[];   //  input textures; indices 0 and 5 are aliased by the *texture macros below
+uniform vec4 sourceSize[];    //  per-input sizes; the macros below read .xy of entries 0 and 5
+uniform vec4 targetSize;      //  output size; .xy is aliased as output_size below
+
+in Vertex {                   //  interface block of inputs to this fragment shader
+   vec2 vTexCoord;
+   vec2 scanline_tex_uv;
+   vec2 blur3x3_tex_uv;
+   float bloom_sigma_runtime;
+};
+
+out vec4 FragColor;           //  color output of this fragment shader
+
+// USER SETTINGS BLOCK //
+// Fixed preprocessor values for the shader settings; most have a *_static counterpart documented under USER PARAMETERS below.
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms
+#define mul(a,b) ((b)*(a))  //  expands to b*a (operands reversed); arguments parenthesized so compound expressions keep their grouping
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  //  empty expansion: removes the keyword wherever it appears after this line
+#define inline  //  empty expansion: removes the keyword wherever it appears after this line
+#define const  //  empty expansion: removes the keyword wherever it appears after this line
+#define fmod(x,y) mod(x,y)  //  NOTE(review): GLSL mod() floors while HLSL/Cg fmod() truncates; results differ for negative operands
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  //  NOTE(review): arguments reach atan() in swapped order, yet HLSL atan2(y,x) matches GLSL atan(y,x) - confirm call sites before changing
+#define rsqrt(c) inversesqrt(c)
+
+#define MASKED_SCANLINEStexture source[0]  //  the MASKED_SCANLINES* names all refer to input 0
+#define MASKED_SCANLINEStexture_size sourceSize[0].xy
+#define MASKED_SCANLINESvideo_size sourceSize[0].xy  //  same expansion as MASKED_SCANLINEStexture_size
+#define BLOOM_APPROXtexture source[5]  //  the BLOOM_APPROX* names all refer to input 5
+#define BLOOM_APPROXtexture_size sourceSize[5].xy
+#define BLOOM_APPROXvideo_size sourceSize[5].xy  //  same expansion as BLOOM_APPROXtexture_size
+
+#if defined(GL_ES)  //  COMPAT_PRECISION is mediump only when GL_ES is defined, otherwise empty
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130  //  holds under this file's "#version 150", so COMPAT_TEXTURE expands to texture
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5 (a port note called 0.5 unnecessarily dark and mentioned 1.0, but the value here is 0.5)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): an earlier note here said 0.5 was unnecessarily dark and 1.0 was set instead, but the value is 0.5.
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRTs), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero by clamping |c| to >= 2^-16 (note the sign of c is dropped); using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so map any value > 1.5 (mode 2) to 0.0 (bilinear):
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  No compatibility mode: use the static filter choice unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths when RUNTIME_SHADER_PARAMS_ENABLE is undefined
+//  (static params in use).  Most are harmless once params are off; some aren't.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD  //  tex2Dlod wins over tex2Dbias.
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else  //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD: use the small LUT.
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else  //  !DRIVERS_ALLOW_TEX2DLOD: use the small LUT.
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else  //  !DRIVERS_ALLOW_DYNAMIC_BRANCHES: only enable if the user forces it.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;    //  NOTE(review): signed .x is used; confirm abs() isn't needed for negative offsets.
+#else  //  !GEOMETRY_EARLY: no antialiasing border is added.
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY: border is left unchanged.
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else  //  !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else  //  !GEOMETRY_EARLY: no tripling, just round up.
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else  //  General case: one tile plus max_mask_tile_border on each side.
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else  //  GEOMETRY_EARLY: always use the bordered general case.
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;  //  Triads across all resized tiles.
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;  //  Per-dimension (x, y).
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES  //  Only defined if uncommented above.
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif  //  DRIVERS_ALLOW_DERIVATIVES
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (raise toward 1.0 if the output looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  Option 2: true 4x4 Gaussian resize
+#else  //  RADEON_FIX
+    static const float bloom_approx_filter_static = 1.0;    //  Option 1: 3x3 resize blur
+#endif  //  !RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif  //  DRIVERS_ALLOW_DERIVATIVES
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): an earlier note here claimed 1.0 was set, but the value assigned on this line is 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  2: true 4x4 Gaussian resize
+#else   //  RADEON_FIX
+    static const float bloom_approx_filter_static = 1.0;    //  1: 3x3 resize blur
+#endif  //  RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;    //  14-pixel stripes
+#else   //  !PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille15_avg_color;    //  15-pixel stripes
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16 floor; abs() discards the sign of c
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;    //  > 1.5 catches mode 2.0
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static choice as-is
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  For wider compatibility, request tex2Dbias as a backup for tex2Dlod:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out strategies whose DRIVERS_ALLOW_* capability macro is not #defined:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance (LOD,
+//  BIAS, TILE_FLAT_TWICE, FIX_DISCONTINUITIES) and #undef all but the first.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  tex2Dlod is not in use; try tex2Dbias next.
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  Neither tex2Dlod nor tex2Dbias is in use.
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize resampling strategies the same way (tex2Dlod beats tex2Dbias):
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  tex2Dlod resampling was not requested: use the small LUT.
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else   //  tex2Dlod is unavailable: use the small LUT.
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  No dynamic branches: only enable runtime selection when forced.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  No same-pass geometry: no antialiasing border is needed.
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  The tex2Dlod/tex2Dbias strategies need no extra border.
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.  Two cases:
+//  a.) TILE_FLAT_TWICE without GEOMETRY_EARLY: two tiles and no borders.
+//  b.) Anything else (including every GEOMETRY_EARLY build): one tile plus a
+//      border of max_mask_tile_border tiles on each side.
+#if !defined(GEOMETRY_EARLY) && defined(ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE)
+    //  Special case: Render two tiles without borders.  Anisotropic
+    //  filtering doesn't seem to be a problem here.
+    static const float mask_resize_num_tiles = 1.0 + 1.0;
+    static const float mask_start_texels = 0.0;
+#else
+    //  General case: pad the single tile on both sides, and start sampling
+    //  max_mask_texel_border texels in.
+    static const float mask_resize_num_tiles = 1.0 +
+        2.0 * max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (the estimate is only as exact as mask_resize_viewport_scale):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant, but OVERRIDE_DEVICE_GAMMA needs it.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+//  Returns normalize(float2(ratio, 1.0)), where ratio is geom_aspect_ratio
+//  clamped to at most geom_max_aspect_ratio.
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Enforce geom_max_aspect_ratio first.  Normalizing afterward prevents
+    //  the absolute scale from affecting the uv-mapping for curvature:
+    const float capped_ratio = min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const float2 unscaled_aspect = float2(capped_ratio, 1.0);
+    return normalize(unscaled_aspect);
+}
+
+//  Accessor: pack the scalar overscan settings into one vector, with
+//  geom_overscan_x in .x and geom_overscan_y in .y.
+inline float2 get_geom_overscan_vector()
+{   return float2(geom_overscan_x, geom_overscan_y);   }
+
+//  Accessor: pack the scalar tilt angle settings into one vector, with
+//  geom_tilt_angle_x in .x and geom_tilt_angle_y in .y.
+inline float2 get_geom_tilt_angle_vector()
+{   return float2(geom_tilt_angle_x, geom_tilt_angle_y);   }
+
+//  Accessor: pack the x convergence offsets of the three color channels into
+//  one vector, ordered (r, g, b).
+inline float3 get_convergence_offsets_x_vector()
+{   return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);    }
+
+//  Accessor: pack the y convergence offsets of the three color channels into
+//  one vector, ordered (r, g, b).
+inline float3 get_convergence_offsets_y_vector()
+{   return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);    }
+
+//  Accessor: pack the r channel's convergence offsets into one vector,
+//  ordered (x, y).
+inline float2 get_convergence_offsets_r_vector()
+{   return float2(convergence_offset_x_r, convergence_offset_y_r);   }
+
+//  Accessor: pack the g channel's convergence offsets into one vector,
+//  ordered (x, y).
+inline float2 get_convergence_offsets_g_vector()
+{   return float2(convergence_offset_x_g, convergence_offset_y_g);   }
+
+//  Accessor: pack the b channel's convergence offsets into one vector,
+//  ordered (x, y).
+inline float2 get_convergence_offsets_b_vector()
+{   return float2(convergence_offset_x_b, convergence_offset_y_b);   }
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    //  The runtime offset is only compiled in when runtime antialiasing
+    //  weights AND runtime subpixel offsets are both #defined.  Every other
+    //  combination uses the static setting.
+    #if defined(RUNTIME_ANTIALIAS_WEIGHTS) && defined(RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS)
+        //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+        return float2(aa_subpixel_r_offset_x_runtime,
+            aa_subpixel_r_offset_y_runtime);
+    #else
+        //  Static fallback:
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Gain = 1.0/average color of the mask selected by mask_type:
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    return mask_type < 0.5 ? grille_gain :
+        (mask_type < 1.5 ? slot_gain : shadow_gain);
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Read the runtime-selectable mode if runtime selection is compiled in,
+    //  and the static setting otherwise:
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        float requested_mode = mask_sample_mode_desired;
+    #else
+        float requested_mode = mask_sample_mode_static;
+    #endif
+    //  Without manual resizing compiled in, restrict the mode to [1.0, 2.0]:
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRTs seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  When true, the encode/decode functions below output alpha = 1.0 instead of
+//  passing the input alpha through.  Only helps users who are aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The includer must have globally defined crt_gamma/gba_gamma/lcd_gamma:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Built-in defaults:
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Later passes don't decode their input:
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Earlier passes don't encode their output:
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes and encodes.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Bilinear filtering is gamma-correct only if this pass's input is not decoded:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+//  Gamma-encodes a color for output from the current pass.
+//  Returns color unchanged unless gamma_encode_output is set; otherwise raises
+//  rgb to 1.0/get_pass_output_gamma().
+inline float4 encode_output(const float4 color)
+{
+    //  Passes that don't encode hand the color back untouched:
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    //  Alpha is forced to 1.0 under assume_opaque_alpha, else passed through:
+    float encoded_alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        encoded_alpha = 1.0;
+    }
+    return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), encoded_alpha);
+}
+
+//  Linearizes a color read from the current pass's input.
+//  Returns color unchanged unless linearize_input is set; otherwise raises
+//  rgb to get_pass_input_gamma().
+inline float4 decode_input(const float4 color)
+{
+    //  Passes that don't linearize hand the color back untouched:
+    if(!linearize_input)
+    {
+        return color;
+    }
+    //  Alpha is forced to 1.0 under assume_opaque_alpha, else passed through:
+    float decoded_alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        decoded_alpha = 1.0;
+    }
+    return float4(pow(color.rgb, float3(get_pass_input_gamma())), decoded_alpha);
+}
+
+//  Raises rgb to a caller-supplied gamma (does not check linearize_input).
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Alpha is forced to 1.0 under assume_opaque_alpha, else passed through:
+    float decoded_alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        decoded_alpha = 1.0;
+    }
+    return float4(pow(color.rgb, gamma), decoded_alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: each overload samples tex and passes the texel through decode_input().
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }  //  Only .xy is sampled; .z is ignored.
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }  //  NOTE(review): texel_off lands in textureLod's LOD slot, not a texel offset -- confirm intent.
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }  //  Same LOD-slot caveat; .z is ignored.
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: both overloads sample at tex_coords.xy only; tex_coords.zw is never read.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }  //  LOD is hard-coded to 0.0.
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }  //  NOTE(review): texel_off is used as the LOD argument -- confirm intent.
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
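+/////////*  NOTE(review): this marker does NOT close the block comment opened before "tex3D:" above; the comment stays open until the closing marker after the tex2Dfetch gamma wrappers, so the tex2D_linearize_gamma overloads below are disabled too -- confirm that is intended.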
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: sample at tex_coords.xy and decode with the caller-supplied gamma via decode_gamma_input().
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }  //  LOD is hard-coded to 0.0; .zw is ignored.
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }  //  NOTE(review): texel_off is used as the LOD argument -- confirm intent.
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what's allowed:
+//  Dynamic loops not allowed: Use a flat static loop.
+//  Dynamic loops accommodated: Coarsely branch around static loops.
+//  Dynamic loops assumed allowed: Use a flat dynamic loop.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS  //  (sic) The spelling must match wherever this macro gets defined.
+        #define BREAK_LOOPS_INTO_PIECES  //  Selects the branch-guarded 64/32/16/8/4/4 loop pieces.
+    #else
+        #define USE_SINGLE_STATIC_LOOP  //  Selects a sample count fixed at max_sinc_resize_samples_m4.
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  See if we can get a static min tile size > mask_min_allowed_tile_size:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size = 
+        mask_min_allowed_tile_size;  //  Currently just a copy of the allowed minimum.
+//  Limit the number of sinc resize taps by the maximum minification factor:
+static const float pi_over_lobes = pi/mask_sinc_lobes;  //  Consumed by the Lanczos-window weight macro.
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;  //  2*lobes taps times (LUT width / min tile size).
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Purpose:    Pick the number of texture samples for a sinc downsize at
+    //              magnification_scale: the minimum needed, rounded up to a
+    //              multiple of 4 and clamped to the global maximum.
+    //  Globals:    mask_sinc_lobes, max_sinc_resize_samples_m4
+    //  Start with the ceiling on the sample count for this codepath:
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float sample_cap_m4 = max_sinc_resize_samples_m4;
+    #else
+        //  Loops simulated with branches can't exceed 128 samples.
+        const float sample_cap_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    //  A downsizing filter spans 2*lobes output pixels rather than 2*lobes
+    //  input texels, so the input sample count scales with 1/magnification:
+    const float samples_needed = 2.0 * mask_sinc_lobes / magnification_scale;
+    //  Loop bodies are vectorized 4 samples at a time, so round up to 4's:
+    const float samples_needed_m4 = ceil(samples_needed * 0.25) * 4.0;
+    return min(samples_needed_m4, sample_cap_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, 
+    const float2 tex_size, const float dr, 
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) input_tiles_per_texture_r is the number of input tiles
+    //                  that can fit in the input texture in the direction we're
+    //                  resampling this pass.
+    //              3.) vertical indicates whether we're resampling vertically
+    //                  this pass (or horizontally).
+    //  Returns:    Pack and return the first sample's tile_uv coord in [0, 1]
+    //              and its texel distance from the destination pixel, in the
+    //              resized dimension only.
+    //  We'll start with the topmost or leftmost sample and work down or right,
+    //  so get the first sample location and distance.  Modify both dimensions
+    //  as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    //  (and incorrect) dimension at the end.
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);  //  Back up (samples/2 - 1) texels from prev_texel.
+    const float2 first_texel_uv_wrap_2D = first_texel * dr;  //  Scalar dr scales both axes; only the resized axis is meaningful.
+    const float2 first_texel_dist_2D = curr_texel - first_texel;
+    //  Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const float2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    //  Project wrapped coordinates to the [0, 1] range.  We'll do this with all
+    //  samples, but the first texel is special, since it might be negative.
+    const float2 coord_negative =
+        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));  //  1.0 per negative axis, else 0.0.
+    const float2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;  //  NOTE(review): if frac(x) is x - floor(x), negatives already wrap and this adds 1.0 on top; the loop macros re-apply frac(), so verify whether this correction is still needed.
+    //  Pack the first texel's tile_uv coord and texel distance in 1D:
+    const float2 tile_u_and_dist =
+        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const float2 tile_v_and_dist =
+        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;  //  Keep only the axis being resized.
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Sample tex at tex_uv.  Mipmapping and anisotropic filtering get confused
+    //  by sinc-resampling, so a [slow] opt-in workaround forces mip level 0:
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);  //  Explicit LOD 0 (textureLod requires the LOD argument).
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));  //  NOTE(review): Cg-style call with a -16.0 bias; not a GLSL built-in, so this path presumably needs a compat definition -- confirm.
+        #else
+            return texture(tex, tex_uv);  //  Default: plain lookup, no LOD override.
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord.  tex_uv_r will contain the texture u or v coord (tile_uv_r
+    //  scaled by tile_size_uv_r) for four new texel samples.  Needs i_base, i, tile_dr in scope.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+    //  Weights for 4 taps from float4 dist/pi_dist in scope.  NOTE(review): at dist == 0 this is 0/0; presumably the min() against 1.0 is meant to cover that -- confirm.
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+    //  Needs magnification_scale, first_dist_unscaled, true_i, new_sample0..3, pixel_color and weight_sum in the invoking scope.
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+    //  Vertical body: 4 samples at fixed tex_uv.x with varying v (tex_uv_r), then accumulate.  Needs tex and tex_uv in scope.
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+    //  Horizontal body: 4 samples at fixed tex_uv.y with varying u (tex_uv_r), then accumulate.  Needs tex and tex_uv in scope.
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the extent an input tile takes up in
+    //                  the input texture along the resampled direction (the
+    //                  loop macros use it as a scale: tile_uv_r * tile_size_uv_r).
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize (samples is passed to a float parameter):
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;  //  Read by the loop body macros (as are i, tex, tex_uv, etc.), so these names must not change.
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;  //  Each loop body invocation handles 4 samples.
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples (64+32+16+8+4+4).
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return (the four weight lanes are summed first):
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);  //  No guard here against a zero weight sum.
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize (samples is passed to a float parameter):
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;  //  Read by the loop body macros (as are i, tex, tex_uv, etc.), so these names must not change.
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;  //  Each loop body invocation handles 4 samples.
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples (64+32+16+8+4+4).
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return (the four weight lanes are summed first):
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);  //  No guard here against a zero weight sum.
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
+    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    //  wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  Sample modes > 0.5 don't sample MASK_RESIZE: no constraints needed.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Requires:   1.) Requirements of get_resized_mask_tile_size() must be
+    //                  met, particularly regarding global constants.
+    //              The function parameters must be defined as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    //  Returns:    Return a float4 containing:
+    //                  xy: tex_uv coords for the start of the mask tile
+    //                  zw: tex_uv size of the mask tile from start to end
+    //              mask_tiles_per_screen is an out parameter containing the
+    //              number of mask tiles that will fit on the screen.
+    //  First get the final resized tile size.  The viewport size and mask
+    //  resize viewport scale must be correct, but don't solemnly swear they
+    //  were correct in both mask resize passes unless you know it's true.
+    //  (We can better ensure a correct tile aspect ratio if the parameters are
+    //  guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    //  sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(mask_sample_mode < 0.5)
+    {
+        //  Mode 0, sample MASK_RESIZE: The resized tile is a fraction of the
+        //  texture size and starts at a nonzero offset for border texels:
+        const float2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
+        const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        //  mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        //  If we're tiling at the original size (1:1 pixel:texel), redefine a
+        //  "tile" to be the full texture containing many triads.  Otherwise,
+        //  we're hardware-resampling an LUT, and the texture truly contains a
+        //  single unresized phosphor mask tile anyway.
+        static const float2 mask_tile_uv_size = float2(1.0);
+        static const float2 mask_tile_start_uv = float2(0.0);
+        if(mask_sample_mode > 1.5)
+        {
+            //  Mode 2: Repeat the full LUT at a 1:1 pixel:texel ratio, unresized:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else
+        {
+            //  Mode 1: Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Requires:   1.) tile_uv_wrap contains tile-relative uv coords, where the
+    //                  tile spans from [0, 1], such that (0.5, 0.5) is at the
+    //                  tile center.  The input coords can range from [0, inf],
+    //                  and their fractional parts map to a repeated tile.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
+    //                  for the start of the embedded tile in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw contains the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    tex_uv coords (for texture sampling) matching tile_uv_wrap.
+    //  NOTE(review): FIX_DISCONTINUITIES path calls a commented-out helper.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        //  Manually repeat the resized mask tile to fill the screen:
+        //  First get fractional tile_uv coords.  Using frac/fmod on coords
+        //  confuses anisotropic filtering; fix it as user options dictate.
+        //  derived-settings-and-constants.h disables incompatible options.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+        #else
+            float2 tile_uv = frac(tile_uv_wrap);
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            const float2 tile_uv_dx = ddx(tile_uv);
+            const float2 tile_uv_dy = ddy(tile_uv);
+            tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
+                tile_uv_dx, tile_uv_dy);
+        #endif
+        //  The tile is embedded in a padded FBO, and it may start at a
+        //  nonzero offset if border texels are used to avoid artifacts:
+        const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+            tile_uv * mask_tile_start_uv_and_size.zw;
+        return mask_tex_uv;
+    }
+    else
+    {
+        //  Sample from the input phosphor mask texture with hardware tiling.
+        //  If we're tiling at the original size (mode 2), the "tile" is the
+        //  whole texture, and it contains a large number of triads mapped with
+        //  a 1:1 pixel:texel ratio.  OTHERWISE, the texture contains a single
+        //  unresized tile.  tile_uv_wrap already has correct coords for both!
+        return tile_uv_wrap;
+    }
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5 (the value set here)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE: an earlier note here claimed 1.0 was chosen for a brighter image, but 0.5 is the value in effect.
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  !PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; abs() drops c's sign
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  !SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (any value > 1.5) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  !DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  !GEOMETRY_EARLY
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else   //  !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else   //  !GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral(e**(-t**2) dt, t = 0..x)
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Return an Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-t**2) dt, t = 0..x)
+    //              This approximation has a max absolute error of 2.5*10**-5
+    //              with solid numerical robustness and efficiency.  See:
+	//                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+	static const float4 one = float4(1.0);
+	const float4 sign_x = sign(x);  //  evaluated on abs(x); sign reapplied last
+	const float4 t = one/(one + 0.47047*abs(x));
+	const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload; same Abramowitz/Stegun formula as the float4 version:
+	static const float3 one = float3(1.0);
+	const float3 sign_x = sign(x);
+	const float3 t = one/(one + 0.47047*abs(x));
+	const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload; same Abramowitz/Stegun formula as the float4 version:
+	static const float2 one = float2(1.0);
+	const float2 sign_x = sign(x);
+	const float2 t = one/(one + 0.47047*abs(x));
+	const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload; same Abramowitz/Stegun formula as the float4 version:
+	const float sign_x = sign(x);
+	const float t = 1.0/(1.0 + 0.47047*abs(x));
+	const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Approximate erf(x) as tanh(1.202760580*x).  The error is
+    //              visually noticeable, but it's blazing fast and perceptually
+    //              close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly implement
+    //              tanh(): My nVidia 8800GTS returns garbage output.
+	return tanh(1.202760580 * x);
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 overload of the tanh()-based erf() approximation:
+	return tanh(1.202760580 * x);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 overload of the tanh()-based erf() approximation:
+	return tanh(1.202760580 * x);
+}
+
+float erft(const float x)
+{
+    //  Scalar overload of the tanh()-based erf() approximation:
+	return tanh(1.202760580 * x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 overload: erft(x) if ERF_FAST_APPROXIMATION is set, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 overload: erft(x) if ERF_FAST_APPROXIMATION is set, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload: erft(x) if ERF_FAST_APPROXIMATION is set, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  This implementation function requires
+    //                  the caller to precompute this value, giving users the
+    //                  opportunity to reuse it.
+    //  Returns:    Return approximate gamma function (real-numbered factorial)
+    //              output using the Lanczos approximation with two coefficients
+    //              calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a max relative error of 0.000463 for 2**16 equally spaced
+    //              evals.  We could use three coeffs (0.0000346 error) without
+    //              hurting latency, but this allows more parallelism with
+    //              outside instructions.
+	static const float4 g = float4(1.12906830989);
+	static const float4 c0 = float4(0.8109119309638332633713423362694399653724431);
+	static const float4 c1 = float4(0.4808354605142681877121661197951496120000040);
+	static const float4 e = float4(2.71828182845904523536028747135266249775724709);
+	const float4 sph = s + float4(0.5);
+	const float4 lanczos_sum = c0 + c1/(s + float4(1.0));
+	const float4 base = (sph + g)/e;  //  or (s + g + float4(0.5))/e
+	//  gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
+	//  This has less error for small s's than (s -= 1.0) at the beginning.
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 overload; same two-coefficient form as the float4 version:
+	static const float3 g = float3(1.12906830989);
+	static const float3 c0 = float3(0.8109119309638332633713423362694399653724431);
+	static const float3 c1 = float3(0.4808354605142681877121661197951496120000040);
+	static const float3 e = float3(2.71828182845904523536028747135266249775724709);
+	const float3 sph = s + float3(0.5);
+	const float3 lanczos_sum = c0 + c1/(s + float3(1.0));
+	const float3 base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 overload; same two-coefficient form as the float4 version:
+	static const float2 g = float2(1.12906830989);
+	static const float2 c0 = float2(0.8109119309638332633713423362694399653724431);
+	static const float2 c1 = float2(0.4808354605142681877121661197951496120000040);
+	static const float2 e = float2(2.71828182845904523536028747135266249775724709);
+	const float2 sph = s + float2(0.5);
+	const float2 lanczos_sum = c0 + c1/(s + float2(1.0));
+	const float2 base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload; same two-coefficient form as the float4 version:
+	static const float g = 1.12906830989;
+	static const float c0 = 0.8109119309638332633713423362694399653724431;
+	static const float c1 = 0.4808354605142681877121661197951496120000040;
+	static const float e = 2.71828182845904523536028747135266249775724709;
+	const float sph = s + 0.5;
+	const float lanczos_sum = c0 + c1/(s + 1.0);
+	const float base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard parameter to the gamma function, and it
+    //              should lie in the [0, 36] range (s = 0 divides by zero).
+    //  Returns:    Return approximate gamma function output with a maximum
+    //              relative error of 0.000463.  See gamma_impl for details.
+	return gamma_impl(s, float4(1.0)/s);
+}
+
+float3 gamma(const float3 s)
+{
+    //  Float3 overload; forwards to gamma_impl() with s_inv = 1.0/s:
+	return gamma_impl(s, float3(1.0)/s);
+}
+
+float2 gamma(const float2 s)
+{
+    //  Float2 overload; forwards to gamma_impl() with s_inv = 1.0/s:
+	return gamma_impl(s, float2(1.0)/s);
+}
+
+float gamma(const float s)
+{
+    //  Scalar overload; forwards to gamma_impl() with s_inv = 1.0/s:
+	return gamma_impl(s, 1.0/s);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The actual "rolled up" summation looks like:
+	//      last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
+	//      sum = last_sign * last_pow / (s * last_factorial);  //  k = 0 term
+	//      for(int k = 1; k < 4; ++k)
+	//      {
+	//          last_sign *= -1.0; last_pow *= z; last_factorial *= k;
+	//          sum += last_sign * last_pow / ((s + k) * last_factorial);
+	//      }
+	//  Unrolled, constant-unfolded and arranged for madds and parallelism:
+	const float4 scale = pow(z, s);
+	float4 sum = s_inv;  //  Summation iteration 0 result
+	//  Summation iterations 1, 2, and 3:
+	const float4 z_sq = z*z;
+	const float4 denom1 = s + float4(1.0);
+	const float4 denom2 = 2.0*s + float4(4.0);
+	const float4 denom3 = 6.0*s + float4(18.0);
+	//float4 denom4 = 24.0*s + float4(96.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	//sum += z_sq * z_sq / denom4;
+	//  Scale and return:
+	return scale * sum;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 overload; same 4-term series as the float4 version:
+	const float3 scale = pow(z, s);
+	float3 sum = s_inv;
+	const float3 z_sq = z*z;
+	const float3 denom1 = s + float3(1.0);
+	const float3 denom2 = 2.0*s + float3(4.0);
+	const float3 denom3 = 6.0*s + float3(18.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 overload; same 4-term series as the float4 version:
+	const float2 scale = pow(z, s);
+	float2 sum = s_inv;
+	const float2 z_sq = z*z;
+	const float2 denom1 = s + float2(1.0);
+	const float2 denom2 = 2.0*s + float2(4.0);
+	const float2 denom3 = 6.0*s + float2(18.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload; same 4-term series as the float4 version:
+	const float scale = pow(z, s);
+	float sum = s_inv;
+	const float z_sq = z*z;
+	const float denom1 = s + 1.0;
+	const float denom2 = 2.0*s + 4.0;
+	const float denom3 = 6.0*s + 18.0;
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+	//  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up:"
+	//      denom = float4('inf');
+	//      float4 one = float4(1.0);
+	//      for(int i = 4; i > 0; --i)
+	//      {
+	//          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+	//      }
+	//  Unrolled (i = 4 down to 1) and constant-folded for madds/parallelism:
+	const float4 numerator = pow(z, s) * exp(-z);
+	float4 denom = float4(7.0) + z - s;
+	denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom;
+	denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom;
+	denom = float4(1.0) + z - s + (s - float4(1.0))/denom;
+	return numerator / denom;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 overload; same 4-term continued fraction as the float4 version:
+	const float3 numerator = pow(z, s) * exp(-z);
+	float3 denom = float3(7.0) + z - s;
+	denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom;
+	denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom;
+	denom = float3(1.0) + z - s + (s - float3(1.0))/denom;
+	return numerator / denom;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 overload: four continued-fraction levels, innermost first.
+	const float2 level4 = float2(7.0) + z - s;
+	const float2 level3 = float2(5.0) + z - s + (3.0*s - float2(9.0))/level4;
+	const float2 level2 = float2(3.0) + z - s + (2.0*s - float2(4.0))/level3;
+	const float2 level1 = float2(1.0) + z - s + (s - float2(1.0))/level2;
+	const float2 z_pow_s_exp = pow(z, s) * exp(-z);
+	return z_pow_s_exp / level1;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: four continued-fraction levels, innermost first.
+	const float level4 = 7.0 + z - s;
+	const float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
+	const float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
+	const float level1 = 1.0 + z - s + (s - 1.0)/level2;
+	const float z_pow_s_exp = pow(z, s) * exp(-z);
+	return z_pow_s_exp / level1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    An approximation of the normalized lower incomplete gamma
+    //              function for s < 0.5.  Restricting s this way leaves only
+    //              two branches (not four) to choose between based on z.  Each
+    //              branch uses four terms, with a max relative error of
+    //              ~0.00182.  The branch threshold and specifics were adapted
+    //              for fewer terms from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both branches are always evaluated and then blended with 0/1 weights:
+    //  Real branches test slower even when available.
+	static const float4 cutoff = float4(0.775075);
+	const float4 upper_branch = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float4 lower_branch = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool4 take_upper;
+	take_upper.x = z.x > cutoff.x;
+	take_upper.y = z.y > cutoff.y;
+	take_upper.z = z.z > cutoff.z;
+	take_upper.w = z.w > cutoff.w;
+    //  Select per component; not() flips the mask for the small-z weight:
+	return upper_branch * float4(take_upper) + lower_branch * float4(not(take_upper));
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  Float3 overload: evaluate both branches, then blend with 0/1 weights.
+	static const float3 cutoff = float3(0.775075);
+	const float3 upper_branch = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float3 lower_branch = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool3 take_upper;
+	take_upper.x = z.x > cutoff.x;
+	take_upper.y = z.y > cutoff.y;
+	take_upper.z = z.z > cutoff.z;
+    //  Select per component; not() flips the mask for the small-z weight:
+	return upper_branch * float3(take_upper) + lower_branch * float3(not(take_upper));
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  Float2 overload: evaluate both branches, then blend with 0/1 weights.
+	static const float2 cutoff = float2(0.775075);
+	const float2 upper_branch = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float2 lower_branch = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool2 take_upper;
+	take_upper.x = z.x > cutoff.x;
+	take_upper.y = z.y > cutoff.y;
+    //  Select per component; not() flips the mask for the small-z weight:
+	return upper_branch * float2(take_upper) + lower_branch * float2(not(take_upper));
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload: evaluate both branches, then blend with 0/1 weights.
+	static const float cutoff = 0.775075;
+	const float upper_branch = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float lower_branch = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	const bool take_upper = z > cutoff;
+	return upper_branch * float(take_upper) + lower_branch * float(!take_upper);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    The normalized_ligamma_impl() approximation, with the 1/s
+    //              and 1/gamma(s) inputs it needs computed here first.
+	const float4 recip_s = float4(1.0)/s;
+	const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  Float3 overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+	const float3 recip_s = float3(1.0)/s;
+	const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  Float2 overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+	const float2 recip_s = float2(1.0)/s;
+	const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+	const float recip_s = 1.0/s;
+	const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Default standard gamma constants.  #define OVERRIDE_STANDARD_GAMMA (and supply all seven yourself) to replace them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas (ntsc_gamma is also the default input, output, and intermediate gamma):
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default for get_crt_gamma()
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default for get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  When true, the encode/decode functions below force the returned alpha to 1.0.
+//  That might help users avoid some bugs, but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gammas exposed as getter functions.  With OVERRIDE_DEVICE_GAMMA they forward to user-defined globals:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define crt_gamma, gba_gamma, and lcd_gamma:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Built-in defaults taken from the standard gamma constants:
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define intermediate_gamma, input_gamma, and output_gamma:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else   //  Derive from the SIMULATE_* macros; the first one defined in the chain below wins:
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  Not SIMULATE_CRT_ON_LCD:
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  Not SIMULATE_GBA_ON_LCD:
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Not SIMULATE_LCD_ON_CRT:
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything: decode and encode with ntsc_gamma.
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  linearize_input and
+//  gamma_encode_output are static constants because they aren't derived, and
+//  because that lets the compiler do dead-code elimination in decode_input()/encode_output().
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Not FIRST_PASS: decode_input() returns its argument unchanged.
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Not LAST_PASS: encode_output() returns its argument unchanged.
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif  //  LAST_PASS
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Not FIRST_PASS:
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Not LAST_PASS:
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  True only when this pass does not decode its input in the shader (linearize_input is false):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Applies the current pass's output gamma encoding to a color.
+    //  Returns:    color unchanged when gamma_encode_output is false.
+    //              Otherwise rgb is raised to 1.0/get_pass_output_gamma(), and
+    //              alpha is either forced to 1.0 (assume_opaque_alpha) or
+    //              passed through untouched.
+    //  Both flags are static constants, so only one path survives compilation.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    const float3 inv_gamma = float3(1.0/get_pass_output_gamma());
+    const float3 encoded_rgb = pow(color.rgb, inv_gamma);
+    //  Alpha is never run through pow():
+    const float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, out_alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Applies the current pass's input gamma decoding to a sampled color.
+    //  Returns:    color unchanged when linearize_input is false.
+    //              Otherwise rgb is raised to get_pass_input_gamma(), and
+    //              alpha is either forced to 1.0 (assume_opaque_alpha) or
+    //              passed through untouched.
+    //  Both flags are static constants, so only one path survives compilation.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    const float3 pass_gamma = float3(get_pass_input_gamma());
+    const float3 linear_rgb = pow(color.rgb, pass_gamma);
+    //  Alpha is never run through pow():
+    const float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, out_alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Decodes a color with a caller-supplied per-channel gamma.  Unlike
+    //  decode_input(), this ignores linearize_input and always applies pow().
+    //  Returns:    rgb raised to gamma, with alpha either forced to 1.0
+    //              (assume_opaque_alpha) or passed through untouched.
+    const float3 linear_rgb = pow(color.rgb, gamma);
+    //  Alpha is never run through pow():
+    const float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, out_alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample the texture, then decode_input() the result.  tex_coords is deliberately left non-const in these overloads.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }   //  Only tex_coords.xy is sampled; .z is ignored.
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }   //  NOTE(review): texel_off is passed as textureLod's LOD argument, not as a texel offset - confirm intent.
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }   //  Same as above: .xy only, texel_off used as the LOD.
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: sample with textureLod(), then decode_input() the result.  Only tex_coords.xy is read; .zw are ignored.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }   //  LOD is fixed at 0.0.
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }   //  NOTE(review): texel_off is passed as the LOD argument - confirm intent.
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
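+/////////*  NOTE(review): this line has no comment terminator, so the block comment opened before "tex3D:" continues past here.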
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D: sample, then decode with the caller's gamma via decode_gamma_input().  NOTE(review): the block comment opened before the tex3D wrappers is not visibly closed above this point, so these definitions appear to be compiled out - confirm.
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }   //  Only tex_coords.xy is sampled.
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: NOTE(review): both overloads sample tex_coords.xy only; the first fixes the LOD at 0.0 and the second passes texel_off as textureLod's third argument - confirm this is intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  holding the desired minimum and maximum beam standard
+    //                  deviations, for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; it is
+    //                  passed in so callers compute it once, even when
+    //                  beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
+    //              inner f(color) subfunction (see below) as:
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The standard deviation of the Gaussian beam for "color:"
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube balances
+    //  between variable width and a soft/realistic shape.   A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    float3 spot_shape;
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Power curve: f(color) = pow(color, beam_spot_power)
+        spot_shape = pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Spherical curve: f(color) = sqrt(1 - (color - 1)^2)
+        const float3 color_offset = color - float3(1.0);
+        spot_shape = sqrt(float3(1.0) - color_offset*color_offset);
+    }
+    return float3(beam_min_sigma) + sigma_range * spot_shape;
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  holding the desired min/max generalized Gaussian
+    //                  beta parameters, for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; it is
+    //                  passed in so callers compute it once, even when
+    //                  beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian "shape" (beta) for color.
+    //  Details/Discussion:
+    //  Beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    const float3 shape_curve = pow(color, float3(beam_shape_power));
+    return beam_min_shape + shape_range * shape_curve;
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    A scanline's light output over a given pixel.
+    //  Details:
+    //  The CRT beam profile is roughly Gaussian, and wider for bright colors
+    //  than for dark ones.  A Gaussian always integrates to 1.0 over its full
+    //  range, so the standard deviation can vary with the beam without
+    //  changing brightness.  With 'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  A numerical approximation of the "error function" (the Gaussian
+    //  indefinite integral) gives the definite integral of the scanline's
+    //  average brightness over a given pixel area.  Even if curved coords were
+    //  used in this pass, a flat scalar pixel height works almost as well as a
+    //  pixel height computed from a full pixel-space to scanline-space matrix.
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 erf_scale = 1.0/(beam_sigma*sqrt(2.0));
+    const float3 erf_upper = erf((dist + half_pixel)*erf_scale);
+    const float3 erf_lower = erf((dist - half_pixel)*erf_scale);
+    return color * 0.5*(erf_upper - erf_lower)/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    A scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models the standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 inv_alpha = float3(1.0)/alpha;
+    const float3 inv_beta = float3(1.0)/beta;
+    //  Pass beta to gamma_impl to avoid repeated divides.  Similarly pass
+    //  beta (i.e. 1/s, with s = 1/beta) and 1/gamma(s) to normalized_ligamma_impl.
+    const float3 inv_gamma_s = float3(1.0)/gamma_impl(inv_beta, beta);
+    const float3 upper_edge = dist + half_pixel;
+    const float3 lower_edge = dist - half_pixel;
+    const float3 signed_upper = sign(upper_edge) * normalized_ligamma_impl(
+        inv_beta, pow(abs(upper_edge)*inv_alpha, beta), beta, inv_gamma_s);
+    const float3 signed_lower = sign(lower_edge) * normalized_ligamma_impl(
+        inv_beta, pow(abs(lower_edge)*inv_alpha, beta), beta, inv_gamma_s);
+    return color * 0.5*(signed_upper - signed_lower)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  Point-sampled counterpart of scanline_gaussian_integral_contrib():
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    //  Precompute reciprocals so the code below only multiplies:
+    const float3 inv_sigma = float3(1.0)/beam_sigma;
+    const float3 exp_scale = 0.5 * inv_sigma * inv_sigma;
+    const float3 norm_scale = inv_sigma/sqrt(2.0*pi);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Also sample 1/3 pixel to either side of the center sample:
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_pixel;
+        const float3 dist_minus = abs(dist - third_pixel);
+        //  Average the three pure Gaussian samples:
+        const float3 avg_scale = color/3.0 * norm_scale;
+        const float3 w_center = exp(-(dist*dist)*exp_scale);
+        const float3 w_plus = exp(-(dist_plus*dist_plus)*exp_scale);
+        const float3 w_minus = exp(-(dist_minus*dist_minus)*exp_scale);
+        return avg_scale * (w_center + w_plus + w_minus);
+    }
+    else
+    {
+        return color*exp(-(dist*dist)*exp_scale)*norm_scale;
+    }
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Point-sampled counterpart of
+    //  scanline_generalized_gaussian_integral_contrib():  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Precompute reciprocals to avoid repeated divides:
+    const float3 inv_alpha = float3(1.0)/alpha;
+    const float3 inv_beta = float3(1.0)/beta;
+    const float3 peak = color * beta * 0.5 * inv_alpha /
+        gamma_impl(inv_beta, beta);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Also sample 1/3 pixel closer to and farther from the scanline.
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_pixel;
+        const float3 dist_minus = abs(dist - third_pixel);
+        //  Average the three generalized Gaussian samples:
+        const float3 w_center = exp(-pow(abs(dist*inv_alpha), beta));
+        const float3 w_plus = exp(-pow(abs(dist_plus*inv_alpha), beta));
+        const float3 w_minus = exp(-pow(abs(dist_minus*inv_alpha), beta));
+        return peak/3.0 * (w_center + w_plus + w_minus);
+    }
+    else
+    {
+        return peak * exp(-pow(abs(dist*inv_alpha), beta));
+    }
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    A scanline's light output over a given pixel.
+    //  Dispatch:   beam_generalized_gaussian picks the generalized or the pure
+    //              Gaussian profile; beam_antialias_level > 1.5 picks the
+    //              integral version of that profile, otherwise the sampled
+    //              version is used.  shape_range is only forwarded to the
+    //              generalized Gaussian helpers.
+    if(beam_generalized_gaussian)
+    {
+        //  Generalized Gaussian beam profile:
+        if(beam_antialias_level > 1.5)
+        {
+            //  Integrate over the pixel:
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        //  Point-sample instead:
+        return scanline_generalized_gaussian_sampled_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+    //  Pure Gaussian beam profile:
+    if(beam_antialias_level > 1.5)
+    {
+        //  Integrate over the pixel:
+        return scanline_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range);
+    }
+    //  Point-sample instead:
+    return scanline_gaussian_sampled_contrib(
+        dist, color, pixel_height, sigma_range);
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Blend the four colors by weights, then clamp at zero to avoid bizarre artifacts from negative colors:
+    const float3 blended = mul(weights, float4x3(color0, color1, color2, color3));
+    return max(blended, 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  beam_horiz_linear_rgb_weight is static, so we can branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are colors in gamma-encoded RGB.
+            //  The exponent is widened to float3 so pow() gets matching types,
+            //  exactly as in the branching path above.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)), weights);
+            return lerp(gamma_mixed_color, linear_mixed_color, beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+            //  No branch is taken on this path, so both blends are always
+            //  computed and lerp() mixes them by beam_horiz_linear_rgb_weight.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (color1)
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  color0, color1, color2, and color3, where color1 is just
+    //                  left of the output pixel.
+    //  Returns:    A horizontally interpolated texture lookup using 2-4 nearby
+    //              texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  The two outer taps stay black unless beam_horiz_filter > 0.5, because
+    //  Quilez resampling can ignore the outside texture lookups.
+    float3 outer_left = float3(0.0);
+    float3 outer_right = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        outer_left = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        outer_right = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    const float3 inner_left = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 inner_right = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    //  Samples are passed on as-is; get_interpolated_linear_color() handles linear vs. gamma-encoded input.
+    return get_interpolated_linear_color(outer_left, inner_left, inner_right, outer_right, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Requires (inferred from usage - confirm): texture_size_inv == 1.0/tex_size.
+    //  Snap to the previous texel and get sample dists from 2/4 nearby texels:
+    const float2 texel_pos = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 left_texel_center =
+        floor(texel_pos - float2(under_half)) + float2(0.5);
+    const float2 snapped_texel = float2(left_texel_center.x, texel_pos.y);
+    const float2 snapped_uv = snapped_texel * texture_size_inv;
+    const float frac_x = texel_pos.x - snapped_texel.x;
+    const float4 tap_dists = float4(1.0 + frac_x, frac_x,
+        1.0 - frac_x, 2.0 - frac_x);
+    //  Pick Quilez, Gaussian, or Lanczos2 resize weights for 2/4 nearby texels:
+    float4 tap_weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez (only the two inner taps get a weight):
+        const float t = tap_dists.y;
+        const float smooth_t = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
+        tap_weights = float4(0.0, 1.0 - smooth_t, smooth_t, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian with standard deviation beam_horiz_sigma:
+        float exp_scale = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        tap_weights = exp(-(tap_dists*tap_dists)*exp_scale);
+    }
+    else
+    {
+        //  Lanczos2 (FIX_ZERO is applied to the scaled distances first):
+        const float4 pi_x = FIX_ZERO(tap_dists * pi);
+        tap_weights = 2.0 * sin(pi_x) * sin(pi_x * 0.5) /
+            (pi_x * pi_x);
+    }
+    //  Normalize so the weights sum to 1.0:
+    const float4 unit_weights = tap_weights/dot(tap_weights, float4(1.0));
+    //  Fetch and blend the taps along the scanline:
+    const float2 texel_step_u = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        tex, snapped_uv, texel_step_u, unit_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Thin wrapper over sample_single_scanline_horizontal() that optionally
+    //  samples each color channel at its own horizontal offset.
+    if(beam_misconvergence)
+    {
+        //  Per-channel offsets are scaled by texture_size_inv.x and subtracted from u:
+        const float3 offsets_rgb =
+            get_convergence_offsets_x_vector();
+        const float3 offsets_u =
+            offsets_rgb * texture_size_inv.xxx;
+        const float2 uv_red = tex_uv - float2(offsets_u.r, 0.0);
+        const float2 uv_green = tex_uv - float2(offsets_u.g, 0.0);
+        const float2 uv_blue = tex_uv - float2(offsets_u.b, 0.0);
+        const float3 red_lookup = sample_single_scanline_horizontal(
+            tex, uv_red, tex_size, texture_size_inv);
+        const float3 green_lookup = sample_single_scanline_horizontal(
+            tex, uv_green, tex_size, texture_size_inv);
+        const float3 blue_lookup = sample_single_scanline_horizontal(
+            tex, uv_blue, tex_size, texture_size_inv);
+        //  Keep only the matching channel of each lookup:
+        return float3(red_lookup.r, green_lookup.g, blue_lookup.b);
+    }
+    //  No misconvergence: a single lookup serves all three channels.
+    return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+        texture_size_inv);
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only consider even/odd scanlines every
+    //  other frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    //  float(interlace_bff) is added to frame_count before taking it modulo 2,
+    //  so it flips which frame parity produces a nonzero field offset.
+    const float field_shift = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 texel_pos = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel_index = floor(texel_pos - float2(under_half));
+    const float rows_off_field = fmod(
+        prev_texel_index.y + field_shift, il_step_multiple.y);
+    const float2 scanline_index = prev_texel_index - float2(0.0, rows_off_field);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 scanline_center = scanline_index + float2(0.5);
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (texel_pos.y - scanline_center.y)/il_step_multiple.y;
+    //  Convert the snapped texel position back to uv space:
+    return scanline_center * texture_size_inv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Guess from the source's line count whether the video is interlaced.
+    //  Always returns false when interlace_detect is disabled.
+    if(interlace_detect)
+    {
+        //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+        //  NTSC Emulators: Typically 224 or 240 lines
+        //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+        //  PAL Emulators: ?
+        //  ATSC: 720p, 1080i, 1080p
+        //  Where do we place our cutoffs?  Assumptions:
+        //  1.) We only need to care about active lines.
+        //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+        //  3.) Anything > 576 lines is probably not interlaced...
+        //  4.) ...except 1080 lines, which is a crapshoot (user decision).
+        //  5.) Just in case the main program uses calculated video sizes,
+        //      we should nudge the float thresholds a bit.
+        //  SD: strictly between 288.5 and 576.5 lines.
+        const bool sd_lines = ((num_lines > 288.5) && (num_lines < 576.5));
+        //  HD: about 1080 lines, and only if interlace_1080i is set.
+        const bool hd_lines = bool(interlace_1080i) &&
+            ((num_lines > 1079.5) && (num_lines < 1080.5));
+        return (sd_lines || hd_lines);
+    }
+    //  Detection is disabled:
+    return false;
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+///////////////////////////////  END VERTEX-INCLUDES  /////////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+float bloom_approx_scale_x = targetSize.x / sourceSize[0].y;  //  NOTE(review): target width over source *height* (.y) - confirm the mixed axes are intended.
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);  //  = 1474560.0
+const float bloom_diff_thresh_ = 1.0/256.0;  //  = 0.00390625
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+//#include "bloom-functions.h"
+
+////////////////////////////  BEGIN BLOOM-FUNCTIONS  ///////////////////////////
+
+#ifndef BLOOM_FUNCTIONS_H
+#define BLOOM_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These utility functions and constants help several passes determine the
+//  size and center texel weight of the phosphor bloom in a uniform manner.
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  We need to calculate the correct blur sigma using some .cgp constants:
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the stock default
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.cgh, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (the original author found 0.5 too dark and suggested 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        //  The red offset may be negative (blue mirrors it), so pad by its magnitude:
+        static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x);
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#else
+    static const float max_aniso_pixel_border = 0.5 + max_aa_base_pixel_border;
+#endif
+//  Discontinuity fixing, when enabled, widens the pixel border by 1.0 more:
+#ifndef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#else
+    static const float max_tiled_pixel_border = 1.0 + max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifndef GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#else
+    static const float max_mask_texel_border = ceil(3.0 * max_tiled_pixel_border);
+#endif
+//  Express the texel border in tiles.  Dividing by the minimum allowed triad
+//  size times the triads per tile gives the worst-case (largest) tile border:
+static const float max_mask_tile_border =
+    max_mask_texel_border / (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+//  Only one combination skips the borders: no GEOMETRY_EARLY together with
+//  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE.  Every other combination renders
+//  a single tile padded by max_mask_tile_border tiles on each side.
+#if !defined(GEOMETRY_EARLY) && defined(ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE)
+    //  Special case: Render two tiles without borders.  Anisotropic
+    //  filtering doesn't seem to be a problem here.
+    static const float mask_resize_num_tiles = 2.0;
+    //  There is no border to skip, so sampling starts at texel 0:
+    static const float mask_start_texels = 0.0;
+#else
+    //  One tile plus a tile border on both sides:
+    static const float mask_resize_num_tiles = 1.0 + 2.0 * max_mask_tile_border;
+    //  Sampling starts just inside the leading texel border:
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+//  Triads spanned by all resized tiles, then divided by the FBO viewport scale:
+static const float mask_resize_num_triads = mask_triads_per_tile * mask_resize_num_tiles;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;     //  Pi truncated to 12 decimals
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here and
+//  use it in place of the literal 0.5 that is subtracted before flooring:
+//      prev_texel = floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;     //  Deliberately just below 0.5
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/blur-functions.h"
+
+////////////////////////////  BEGIN BLUR-FUNCTIONS  ///////////////////////////
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  output_size < video_size.
+//              4.) output_size == video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (video_size/output_size)/texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(video_size/output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static const float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static const float blur3_std_dev
+//                      static const float blur4_std_dev
+//                      static const float blur5_std_dev
+//                      static const float blur6_std_dev
+//                      static const float blur7_std_dev
+//                      static const float blur8_std_dev
+//                      static const float blur9_std_dev
+//                      static const float blur10_std_dev
+//                      static const float blur11_std_dev
+//                      static const float blur12_std_dev
+//                      static const float blur17_std_dev
+//                      static const float blur25_std_dev
+//                      static const float blur31_std_dev
+//                      static const float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static const float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+#ifndef OVERRIDE_BLUR_STD_DEVS  //  Otherwise the includer supplies blurN_std_dev
+    //  blurN_std_dev values are specified in terms of dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  By request, we can define standard deviations corresponding to a
+        //  binomial distribution with p = 0.5 (related to Pascal's triangle).
+        //  This distribution works such that blurring multiple times should
+        //  have the same result as a single larger blur.  These values are
+        //  larger than default for blurs up to 6x and smaller thereafter.
+        static const float blur3_std_dev = 0.84931640625;
+        static const float blur4_std_dev = 0.84931640625;   //  Same as blur3
+        static const float blur5_std_dev = 1.0595703125;
+        static const float blur6_std_dev = 1.06591796875;
+        static const float blur7_std_dev = 1.17041015625;
+        static const float blur8_std_dev = 1.1720703125;
+        static const float blur9_std_dev = 1.2259765625;
+        static const float blur10_std_dev = 1.21982421875;
+        static const float blur11_std_dev = 1.25361328125;
+        static const float blur12_std_dev = 1.2423828125;
+        static const float blur17_std_dev = 1.27783203125;
+        static const float blur25_std_dev = 1.2810546875;
+        static const float blur31_std_dev = 1.28125;
+        static const float blur43_std_dev = 1.28125;        //  Same as blur31
+    #else
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        static const float blur3_std_dev = 0.62666015625;
+        static const float blur4_std_dev = 0.66171875;
+        static const float blur5_std_dev = 0.9845703125;
+        static const float blur6_std_dev = 1.02626953125;
+        static const float blur7_std_dev = 1.36103515625;
+        static const float blur8_std_dev = 1.4080078125;
+        static const float blur9_std_dev = 1.7533203125;
+        static const float blur10_std_dev = 1.80478515625;
+        static const float blur11_std_dev = 2.15986328125;
+        static const float blur12_std_dev = 2.215234375;
+        static const float blur17_std_dev = 3.45535583496;
+        static const float blur25_std_dev = 5.3409576416;
+        static const float blur31_std_dev = 6.86488037109;
+        static const float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#if !defined(OVERRIDE_ERROR_BLURRING)
+    //  Default tuning for shared-sample blurs; keep it within [0.0, 1.0].
+    //  Raising it trades ringing for more blurring and feature shifting.
+    static const float error_blurring = 0.5;
+#endif
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (lowest precedence)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA     //  Otherwise the includer defines all seven
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRTs seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  Low end of (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#if !defined(OVERRIDE_ALPHA_ASSUMPTIONS)
+    static const bool assume_opaque_alpha = false;  //  Default: keep color.a
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; }    //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#else
+    //  The including file has promised to define these constants globally:
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+//  Without OVERRIDE_FINAL_GAMMA, the input/output gammas follow the SIMULATE_*
+//  macros; with it, they come straight from the including file's constants:
+#ifndef OVERRIDE_FINAL_GAMMA
+    //  Passes that gamma-encode every FBO always use ntsc_gamma between passes,
+    //  so middle passes never need to care whether anything is being simulated:
+    inline float get_intermediate_gamma()   { return ntsc_gamma; }
+    //  SIMULATE_X_ON_Y selects X's gamma for decoding input and Y's gamma for
+    //  encoding output.  If several are #defined, the one tested first below
+    //  takes precedence:
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma()      { return get_crt_gamma(); }
+        inline float get_output_gamma()     { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma()      { return get_gba_gamma(); }
+        inline float get_output_gamma()     { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma()      { return get_lcd_gamma(); }
+        inline float get_output_gamma()     { return get_crt_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma()      { return get_gba_gamma(); }
+        inline float get_output_gamma()     { return get_crt_gamma(); }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      { return ntsc_gamma; }
+        inline float get_output_gamma()     { return ntsc_gamma; }
+    #endif  //  SIMULATE_*
+#else
+    //  The including file promises to globally define intermediate_gamma,
+    //  input_gamma, and output_gamma:
+    inline float get_intermediate_gamma()   { return intermediate_gamma; }
+    inline float get_input_gamma()          { return input_gamma; }
+    inline float get_output_gamma()         { return output_gamma; }
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+//  Input side: the first pass linearizes its input, and so does every pass
+//  when GAMMA_ENCODE_EVERY_FBO is #defined:
+#if defined(GAMMA_ENCODE_EVERY_FBO) || defined(FIRST_PASS)
+    static const bool linearize_input = true;
+#else
+    static const bool linearize_input = false;
+#endif
+#if defined(FIRST_PASS)
+    inline float get_pass_input_gamma()     { return get_input_gamma(); }
+#elif defined(GAMMA_ENCODE_EVERY_FBO)
+    inline float get_pass_input_gamma()     { return get_intermediate_gamma(); }
+#else
+    inline float get_pass_input_gamma()     { return 1.0; }
+#endif
+//  Output side: the same scheme keyed on LAST_PASS instead of FIRST_PASS.  On
+//  both sides, a pass gamma of 1.0 always accompanies a false flag, i.e. the
+//  pass performs no gamma conversion in that direction:
+#if defined(GAMMA_ENCODE_EVERY_FBO) || defined(LAST_PASS)
+    static const bool gamma_encode_output = true;
+#else
+    static const bool gamma_encode_output = false;
+#endif
+#if defined(LAST_PASS)
+    inline float get_pass_output_gamma()    { return get_output_gamma(); }
+#elif defined(GAMMA_ENCODE_EVERY_FBO)
+    inline float get_pass_output_gamma()    { return get_intermediate_gamma(); }
+#else
+    inline float get_pass_output_gamma()    { return 1.0; }
+#endif
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;  //  False whenever this pass decodes its own input
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+//  Gamma-encode a color for this pass's output, if this pass encodes at all.
+inline float4 encode_output(const float4 color)
+{
+    //  Passes that don't gamma-encode return the color untouched (alpha
+    //  included, whatever assume_opaque_alpha says):
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    //  Only rgb is raised to 1.0/gamma.  Alpha is either forced opaque or
+    //  copied through, but it is never gamma-encoded:
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Returns:    color unchanged if this pass doesn't linearize its input
+    //              (linearize_input is false); otherwise .rgb raised per
+    //              channel to get_pass_input_gamma(), with alpha forced to
+    //              1.0 when assume_opaque_alpha is set and passed through
+    //              otherwise.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    //  The exponent is splatted to a float3 so pow() is applied per channel:
+    float3 decode_exponent = float3(get_pass_input_gamma());
+    float3 decoded_rgb = pow(color.rgb, decode_exponent);
+    //  Opaque pipelines drop the incoming alpha in favor of 1.0:
+    float decoded_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(decoded_rgb, decoded_alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Requires:   gamma holds the per-channel decoding exponent to apply.
+    //  Returns:    color with .rgb raised per channel to gamma.  This does
+    //              not consult linearize_input: it decodes unconditionally.
+    //              Alpha is forced to 1.0 when assume_opaque_alpha is set and
+    //              passed through otherwise.
+    float3 decoded_rgb = pow(color.rgb, gamma);
+    float decoded_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(decoded_rgb, decoded_alpha);
+}
+
+//  NOTE: Replacing the lookup wrappers with the macro below fixed the blurs being offset:
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+//  EDIT: The cause was the 'const' qualifier in front of the wrappers' coordinate parameters, so the live wrappers below take their coords without it.
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: Sample tex with COMPAT_TEXTURE() and pass the result through decode_input().
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }   //  Only tex_coords.xy is sampled; .z is ignored.
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }   //  NOTE(review): texel_off is passed as textureLod()'s LOD argument, not applied as a texel offset -- confirm this is intended.
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }   //  Only tex_coords.xy is sampled.  NOTE(review): texel_off is passed as textureLod()'s LOD argument, not applied as a texel offset -- confirm this is intended.
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: Sample tex with textureLod() and pass the result through decode_input().
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }   //  The LOD is hardcoded to 0.0; tex_coords.zw are ignored.
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }   //  tex_coords.zw are ignored.  NOTE(review): texel_off is passed as textureLod()'s LOD argument, not applied as a texel offset -- confirm this is intended.
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
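+/////////*  NOTE(review): this line does not terminate the block comment opened above "tex3D:"; the comment stays open until the terminator that follows the tex2Dfetch_linearize_gamma wrappers, so the tex2D_linearize_gamma definitions in between are disabled.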
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: Sample tex with textureLod() and decode the result with the caller's gamma via decode_gamma_input().
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }   //  The LOD is hardcoded to 0.0; tex_coords.zw are ignored.
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }   //  tex_coords.zw are ignored.  NOTE(review): texel_off is passed as textureLod()'s LOD argument, not applied as a texel offset -- confirm this is intended.
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "quad-pixel-communication.h"
+
+///////////////////////  BEGIN QUAD-PIXEL-COMMUNICATION  //////////////////////
+
+#ifndef QUAD_PIXEL_COMMUNICATION_H
+#define QUAD_PIXEL_COMMUNICATION_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey*
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DISCLAIMER  /////////////////////////////////
+
+//  *This code was inspired by "Shader Amortization using Pixel Quad Message
+//  Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2.  My intent
+//  is not to plagiarize his fundamentally similar code and assert my own
+//  copyright, but the algorithmic helper functions require so little code that
+//  implementations can't vary by much except bugfixes and conventions.  I just
+//  wanted to license my own particular code here to avoid ambiguity and make it
+//  clear that as far as I'm concerned, people can do as they please with it.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  Given screen pixel numbers, derive a "quad vector" describing a fragment's
+//  position in its 2x2 pixel quad.  Given that vector, obtain the values of any
+//  variable at neighboring fragments.
+//  Requires:   Using this file in general requires:
+//              1.) ddx() and ddy() are present in the current Cg profile.
+//              2.) The GPU driver is using fine/high-quality derivatives.
+//                  Functions will give incorrect results if this is not true,
+//                  so a test function is included.
+
+
+/////////////////////  QUAD-PIXEL COMMUNICATION PRIMITIVES  ////////////////////
+
+float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   output_pixel_num_wrt_uvxy packs two measures of this
+    //              fragment's output pixel number, each within
+    //              ([0, output_size.x), [0, output_size.y)):
+    //              1.) .xy increase along with uv coords.
+    //              2.) .zw increase along with screen xy.
+    //  Returns:    Two measures of the fragment's placement in its 2x2 quad:
+    //              1.) .xy give the placement with respect to uv direction,
+    //                  with the origin (0, 0) at the top-left:
+    //                  top-left     = (-1.0, -1.0) top-right    = ( 1.0, -1.0)
+    //                  bottom-left  = (-1.0,  1.0) bottom-right = ( 1.0,  1.0)
+    //                  Use this to arrange/weight shared texture samples.
+    //              2.) .zw give the placement with respect to screen xy
+    //                  direction (position); the origin varies.
+    //                  quad_gather depends on this measure.
+    //              Note: quad_vector.zw = quad_vector.xy * float2(
+    //                      ddx(output_pixel_num_wrt_uvxy.x),
+    //                      ddy(output_pixel_num_wrt_uvxy.y));
+    //  Caveats:    Assumes 2x2 pixel quads start at even pixel numbers, which
+    //              odd output resolutions can nondeterministically violate.
+    float4 half_pixel_num = output_pixel_num_wrt_uvxy * 0.5;
+    float4 pixel_parity = frac(half_pixel_num) * 2.0;
+    return pixel_parity * 2.0 - float4(1.0);
+}
+
+float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   The same input as get_quad_vector_naive() (read that first).
+    //  Returns:    The same quad vector as get_quad_vector_naive(), except the
+    //              result stays correct when the 2x2 pixel quad begins at an
+    //              odd pixel (possible at odd resolutions).
+    float4 naive_vector = get_quad_vector_naive(output_pixel_num_wrt_uvxy);
+    //  naive_vector.zw failing to increase with screen xy means the quad
+    //  begins at an odd pixel.  Half of each screen-space derivative is
+    //  applied as a per-axis scale factor to correct the guess:
+    float mirror_x = 0.5 * ddx(naive_vector.z);
+    float mirror_y = 0.5 * ddy(naive_vector.w);
+    return naive_vector * float4(mirror_x, mirror_y, mirror_x, mirror_y);
+}
+
+float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
+{
+    //  Requires:   1.) The current Cg profile provides ddx() and ddy().
+    //              2.) output_pixel_num_wrt_uv increases with uv coords and
+    //                  gives the current fragment's output pixel number in:
+    //                      ([0, output_size.x), [0, output_size.y))
+    //  Returns:    The same quad vector as get_quad_vector_naive() (read that
+    //              first), except the result stays correct when the 2x2 pixel
+    //              quad begins at an odd pixel (possible at odd resolutions).
+    //  Caveats:    Less information is needed than for the float4 overload,
+    //              but this version is potentially slower.
+    //  First find out whether screen coords run with or against uv: this is
+    //  the direction of (screen.x, screen.y) relative to (uv.x, uv.y).
+    float uv_ddx = ddx(output_pixel_num_wrt_uv.x);
+    float uv_ddy = ddy(output_pixel_num_wrt_uv.y);
+    float2 uv_to_screen_sign = float2(uv_ddx, uv_ddy);
+    float2 uv_parity = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
+    float2 uv_guess = (uv_parity - float2(0.5)) * 2.0;
+    float2 screen_guess = uv_guess * uv_to_screen_sign;
+    //  A screen_guess that fails to increase with screen xy means the 2x2
+    //  pixel quad begins at an odd pixel:
+    float2 odd_start_mirror = 0.5 * float2(ddx(screen_guess.x),
+                                           ddy(screen_guess.y));
+    float4 combined_guess = float4(uv_guess, screen_guess);
+    return combined_guess * odd_start_mirror.xyxy;
+}
+
+void quad_gather(float4 quad_vector, float4 curr,
+    out float4 adjx, out float4 adjy, out float4 diag)
+{
+    //  Requires:   1.) ddx()/ddy() exist in the current Cg profile, and the
+    //                  GPU driver uses fine/high-quality derivatives.
+    //              2.) quad_vector follows get_quad_vector()'s conventions for
+    //                  the current fragment; curr is any vector to share.
+    //  Returns:    Via the out parameters, curr's value at the fragments
+    //              adjacent in x, adjacent in y, and diagonal.
+    float2 screen_pos = quad_vector.zw;
+    adjx = curr - ddx(curr) * screen_pos.x;
+    adjy = curr - ddy(curr) * screen_pos.y;
+    diag = adjx - ddy(adjx) * screen_pos.y;
+}
+
+void quad_gather(float4 quad_vector, float3 curr,
+    out float3 adjx, out float3 adjy, out float3 diag)
+{
+    float2 screen_pos = quad_vector.zw;   //  Float3 overload of quad_gather()
+    adjx = curr - ddx(curr) * screen_pos.x;
+    adjy = curr - ddy(curr) * screen_pos.y;
+    diag = adjx - ddy(adjx) * screen_pos.y;
+}
+
+void quad_gather(float4 quad_vector, float2 curr,
+    out float2 adjx, out float2 adjy, out float2 diag)
+{
+    float2 screen_pos = quad_vector.zw;   //  Float2 overload of quad_gather()
+    adjx = curr - ddx(curr) * screen_pos.x;
+    adjy = curr - ddy(curr) * screen_pos.y;
+    diag = adjx - ddy(adjx) * screen_pos.y;
+}
+
+float4 quad_gather(float4 quad_vector, float curr)
+{
+    //  Scalar overload; the whole quad is packed into one float4:
+    //  Returns:    .x = curr at the current fragment
+    //              .y = curr at the x-adjacent fragment
+    //              .z = curr at the y-adjacent fragment
+    //              .w = curr at the diagonal fragment
+    float adjacent_x = curr - ddx(curr) * quad_vector.z;
+    float2 this_row = float2(curr, adjacent_x);
+    float2 other_row = this_row - ddy(this_row) * quad_vector.w;
+    return float4(this_row, other_row);
+}
+
+float4 quad_gather_sum(float4 quad_vector, float4 curr)
+{
+    //  Requires:   The same preconditions as quad_gather().
+    //  Returns:    curr summed over all four fragments of the 2x2 quad.
+    float4 neighbor_x, neighbor_y, neighbor_xy;
+    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_xy);
+    return ((curr + neighbor_x) + neighbor_y) + neighbor_xy;
+}
+
+float3 quad_gather_sum(float4 quad_vector, float3 curr)
+{
+    //  Float3 overload: curr summed over all four fragments of the quad.
+    float3 neighbor_x, neighbor_y, neighbor_xy;
+    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_xy);
+    return ((curr + neighbor_x) + neighbor_y) + neighbor_xy;
+}
+
+float2 quad_gather_sum(float4 quad_vector, float2 curr)
+{
+    //  Float2 overload: curr summed over all four fragments of the quad.
+    float2 neighbor_x, neighbor_y, neighbor_xy;
+    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_xy);
+    return ((curr + neighbor_x) + neighbor_y) + neighbor_xy;
+}
+
+float quad_gather_sum(float4 quad_vector, float curr)
+{
+    //  Scalar overload: the float quad_gather() packs all four values in a float4.
+    float4 quad_values = quad_gather(quad_vector, curr);
+    return ((quad_values.x + quad_values.y) + quad_values.z) + quad_values.w;
+}
+
+bool fine_derivatives_working(float4 quad_vector, float4 curr)
+{
+    //  Requires:   1.) The current Cg profile provides ddx() and ddy().
+    //              2.) quad_vector follows get_quad_vector()'s conventions for
+    //                  the current fragment's location in its 2x2 pixel quad.
+    //              3.) curr is a test vector whose derivatives are not
+    //                  constant (it must vary nonlinearly across fragments).
+    //  Returns:    true when fine/hybrid/high-quality derivatives are in use;
+    //              false when derivatives are coarse or the test is
+    //              inconclusive.
+    //  Usage:      Check that quad-pixel communication actually works!
+    //  Method:     Fine derivatives are confirmed as soon as either
+    //                  (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
+    //              holds for any component at any fragment, so testing more
+    //              values (e.g. a whole float4, both ways) helps.
+    //  TODO: Check for floating point exact comparison issues!
+    float4 dx_here = ddx(curr);
+    float4 dy_here = ddy(curr);
+    float4 dy_at_adjx = ddy(curr - dx_here * quad_vector.z);
+    float4 dx_at_adjy = ddx(curr - dy_here * quad_vector.w);
+    bool dy_differs = any(bool4(dy_here.x != dy_at_adjx.x, dy_here.y != dy_at_adjx.y, dy_here.z != dy_at_adjx.z, dy_here.w != dy_at_adjx.w));
+    bool dx_differs = any(bool4(dx_here.x != dx_at_adjy.x, dx_here.y != dx_at_adjy.y, dx_here.z != dx_at_adjy.z, dx_here.w != dx_at_adjy.w));
+    return any(bool2(dy_differs, dx_differs));
+}
+
+bool fine_derivatives_working_fast(float4 quad_vector, float curr)
+{
+    //  Requires:   The same preconditions as fine_derivatives_working().
+    //  Returns:    The same verdict as fine_derivatives_working().
+    //  Usage:      Cheaper than fine_derivatives_working(), but false
+    //              negatives are more likely, which makes it less useful for
+    //              offline testing/debugging.  As of May 2014 it is also
+    //              useless for dynamic runtime branching, because derivatives
+    //              (and quad-pixel communication) are disallowed inside
+    //              branches.  Future GPU's may permit them in dynamic branches
+    //              if you promise the branch condition evaluates identically
+    //              for every fragment in the quad (and/or if the driver
+    //              enforces that promise by letting a single fragment control
+    //              branch decisions).  Should that happen, this version may
+    //              become the more economical choice.
+    float value_at_adjx = curr - ddx(curr) * quad_vector.z;
+    float dy_here = ddy(curr);
+    float dy_at_adjx = ddy(value_at_adjx);
+    return (dy_here != dy_at_adjx);
+}
+
+#endif  //  QUAD_PIXEL_COMMUNICATION_H
+
+////////////////////////  END QUAD-PIXEL-COMMUNICATION  ///////////////////////
+
+//#include "special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the usual erf() argument.
+    //  Returns:    An Abramowitz/Stegun approximation of
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              whose max absolute error is 2.5*10**-5, with solid
+    //              numerical robustness and efficiency.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    //  Evaluate the polynomial form on abs(x), then reapply the sign of x:
+    float4 t = float4(1.0)/(float4(1.0) + 0.47047*abs(x));
+    float4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float4 gauss = exp(-(x*x));
+    float4 magnitude = float4(1.0) - poly*gauss;
+    return magnitude * sign(x);
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload of the Abramowitz/Stegun erf() approximation.
+    //  Evaluate the polynomial form on abs(x), then reapply the sign of x:
+    float3 t = float3(1.0)/(float3(1.0) + 0.47047*abs(x));
+    float3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float3 gauss = exp(-(x*x));
+    float3 magnitude = float3(1.0) - poly*gauss;
+    return magnitude * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload of the Abramowitz/Stegun erf() approximation.
+    //  Evaluate the polynomial form on abs(x), then reapply the sign of x:
+    float2 t = float2(1.0)/(float2(1.0) + 0.47047*abs(x));
+    float2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float2 gauss = exp(-(x*x));
+    float2 magnitude = float2(1.0) - poly*gauss;
+    return magnitude * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload: polynomial form on abs(x), sign of x reapplied last.
+    float t = 1.0/(1.0 + 0.47047*abs(x));
+    float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    float gauss = exp(-(x*x));
+    float magnitude = 1.0 - poly*gauss;
+    return magnitude * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the usual erf() argument.
+    //  Returns:    erf() approximated by a scaled tanh(): visibly inexact, but
+    //              blazing fast and perceptually close (at least on ATI).  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Needs hardware drivers with a correct tanh(): the original
+    //              author's nVidia 8800GTS returned garbage output.
+    float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 version: tanh()-based erf() approximation.
+	return tanh(x * 1.202760580);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 version: tanh()-based erf() approximation.
+	return tanh(x * 1.202760580);
+}
+
+float erft(const float x)
+{
+    //  Float version: tanh()-based erf() approximation.
+	return tanh(x * 1.202760580);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 version: erft(x) under ERF_FAST_APPROXIMATION, else erf6(x).
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 version: erft(x) under ERF_FAST_APPROXIMATION, else erf6(x).
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+inline float erf(const float x)
+{
+    //  Float version: erft(x) under ERF_FAST_APPROXIMATION, else erf6(x).
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the gamma function argument, expected to lie in
+    //                  the [0, 36] range.
+    //              2.) s_inv = 1.0/s, precomputed by the caller so that it
+    //                  can be reused elsewhere.
+    //  Returns:    Approximate gamma(s) (real-numbered factorial) from a
+    //              two-coefficient Lanczos approximation.  Coefficients were
+    //              calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              Per the original author, g ~= 1.12906830989 is optimal for
+    //              s in [0, 36], with a maximum relative error of 0.000463
+    //              over 2**16 equally spaced evals.  Three coeffs (0.0000346
+    //              error) would not hurt latency, but two allow more
+    //              parallelism with outside instructions.
+	static const float4 lanczos_g = float4(1.12906830989);
+	static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+	static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+	static const float4 euler = float4(2.71828182845904523536028747135266249775724709);
+	const float4 s_plus_half = s + float4(0.5);
+	const float4 series = coeff0 + coeff1/(s + float4(1.0));
+	//  gamma(s + 1) = base**(s + 0.5) * series, where base = (s + g + 0.5)/e.
+	//  Multiplying by s_inv afterward yields gamma(s); the original author
+	//  found this has less error for small s than starting with (s -= 1.0).
+	const float4 base = (s_plus_half + lanczos_g)/euler;
+	return (pow(base, s_plus_half) * series) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 version: two-coefficient Lanczos approximation.  Computes
+    //  gamma(s + 1), then multiplies by s_inv (= 1.0/s) to get gamma(s).
+	static const float3 lanczos_g = float3(1.12906830989);
+	static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+	static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+	static const float3 euler = float3(2.71828182845904523536028747135266249775724709);
+	const float3 s_plus_half = s + float3(0.5);
+	const float3 series = coeff0 + coeff1/(s + float3(1.0));
+	return (pow((s_plus_half + lanczos_g)/euler, s_plus_half) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 version: two-coefficient Lanczos approximation.  Computes
+    //  gamma(s + 1), then multiplies by s_inv (= 1.0/s) to get gamma(s).
+	static const float2 lanczos_g = float2(1.12906830989);
+	static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+	static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+	static const float2 euler = float2(2.71828182845904523536028747135266249775724709);
+	const float2 s_plus_half = s + float2(0.5);
+	const float2 series = coeff0 + coeff1/(s + float2(1.0));
+	return (pow((s_plus_half + lanczos_g)/euler, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Float version: two-coefficient Lanczos approximation.  Computes
+    //  gamma(s + 1), then multiplies by s_inv (= 1.0/s) to get gamma(s).
+	static const float lanczos_g = 1.12906830989;
+	static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+	static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+	static const float euler = 2.71828182845904523536028747135266249775724709;
+	const float s_plus_half = s + 0.5;
+	const float series = coeff0 + coeff1/(s + 1.0);
+	return (pow((s_plus_half + lanczos_g)/euler, s_plus_half) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the gamma function argument, expected in [0, 36].
+    //  Returns:    Approximate gamma(s); gamma_impl() documents the method
+    //              and its maximum relative error of 0.000463.
+	const float4 s_inv = float4(1.0)/s;
+	return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+	const float3 s_inv = float3(1.0)/s;  //  gamma_impl() needs 1/s alongside s.
+	return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+	const float2 s_inv = float2(1.0)/s;  //  gamma_impl() needs 1/s alongside s.
+	return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+	const float s_inv = 1.0/s;  //  gamma_impl() needs 1/s alongside s.
+	return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z, truncated to 4 terms:
+    //                  z**s * sum(k = 0..3, (-z)**k / ((s + k) * k!))
+    //  Written as a loop, the summation would look like:
+	//      sign = 1.0; z_pow = 1.0; factorial = 1.0;
+	//      sum = sign * z_pow / ((s + 0.0) * factorial);
+	//      for(int k = 1; k < 4; ++k)
+	//      {
+	//          sign *= -1.0; z_pow *= z; factorial *= k;
+	//          sum += sign * z_pow / ((s + k) * factorial);
+	//      }
+	//  Below, the loop is unrolled with constants folded into each
+	//  denominator: (s + k) * k! becomes s + 1, 2*s + 4, and 6*s + 18.
+	const float4 z_sq = z*z;
+	const float4 z_cu = z * z_sq;
+	//  Terms k = 1, 2, 3 (term k = 0 is simply s_inv):
+	const float4 term1 = z/(s + float4(1.0));
+	const float4 term2 = z_sq/(2.0*s + float4(4.0));
+	const float4 term3 = z_cu/(6.0*s + float4(18.0));
+	//  An optional fifth term is left disabled:
+	//      term4 = z_sq * z_sq/(24.0*s + float4(96.0));
+	const float4 series = ((s_inv - term1) + term2) - term3;
+	//  Apply the z**s prefactor and return:
+	const float4 prefactor = pow(z, s);
+	return prefactor * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 version: 4-term series
+    //      z**s * (1/s - z/(s + 1) + z**2/(2*s + 4) - z**3/(6*s + 18))
+    //  Requires s_inv = 1.0/s (supplied by the caller).
+	const float3 z_sq = z*z;
+	const float3 term1 = z/(s + float3(1.0));
+	const float3 term2 = z_sq/(2.0*s + float3(4.0));
+	const float3 term3 = (z * z_sq)/(6.0*s + float3(18.0));
+	//  Term 0 is s_inv; signs alternate from there:
+	const float3 series = ((s_inv - term1) + term2) - term3;
+	const float3 prefactor = pow(z, s);
+	return prefactor * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 version: 4-term series
+    //      z**s * (1/s - z/(s + 1) + z**2/(2*s + 4) - z**3/(6*s + 18))
+    //  Requires s_inv = 1.0/s (supplied by the caller).
+	const float2 z_sq = z*z;
+	const float2 term1 = z/(s + float2(1.0));
+	const float2 term2 = z_sq/(2.0*s + float2(4.0));
+	const float2 term3 = (z * z_sq)/(6.0*s + float2(18.0));
+	//  Term 0 is s_inv; signs alternate from there:
+	const float2 series = ((s_inv - term1) + term2) - term3;
+	const float2 prefactor = pow(z, s);
+	return prefactor * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Float version: 4-term series
+    //      z**s * (1/s - z/(s + 1) + z**2/(2*s + 4) - z**3/(6*s + 18))
+    //  Requires s_inv = 1.0/s (supplied by the caller).
+	const float z_sq = z*z;
+	const float term1 = z/(s + 1.0);
+	const float term2 = z_sq/(2.0*s + 4.0);
+	const float term3 = (z * z_sq)/(6.0*s + 18.0);
+	//  Term 0 is s_inv; signs alternate from there:
+	const float series = ((s_inv - term1) + term2) - term3;
+	const float prefactor = pow(z, s);
+	return prefactor * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function, truncated to 4 levels.
+	//  As a loop, the truncated denominator is built from the innermost
+	//  level outward:
+	//      denom = float4('inf');
+	//      for(int i = 4; i > 0; --i)
+	//      {
+	//          denom = ((i * 2.0) - 1.0) + z - s + (i * (s - i))/denom;
+	//      }
+	//  Unrolled here, with i * (s - i) expanded to constants per level:
+	const float4 level4 = float4(7.0) + z - s;
+	const float4 level3 = float4(5.0) + z - s + (3.0*s - float4(9.0))/level4;
+	const float4 level2 = float4(3.0) + z - s + (2.0*s - float4(4.0))/level3;
+	const float4 level1 = float4(1.0) + z - s + (s - float4(1.0))/level2;
+	//  The fraction's numerator is z**s * e**(-z):
+	const float4 top = pow(z, s) * exp(-z);
+	return top / level1;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 version: 4-level continued fraction, evaluated from the
+    //  innermost level (level4) outward.  Expects s < ~0.5, z > ~0.775075.
+	const float3 level4 = float3(7.0) + z - s;
+	const float3 level3 = float3(5.0) + z - s + (3.0*s - float3(9.0))/level4;
+	const float3 level2 = float3(3.0) + z - s + (2.0*s - float3(4.0))/level3;
+	const float3 level1 = float3(1.0) + z - s + (s - float3(1.0))/level2;
+	return (pow(z, s) * exp(-z)) / level1;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 version: 4-level continued fraction, evaluated from the
+    //  innermost level (level4) outward.  Expects s < ~0.5, z > ~0.775075.
+	const float2 level4 = float2(7.0) + z - s;
+	const float2 level3 = float2(5.0) + z - s + (3.0*s - float2(9.0))/level4;
+	const float2 level2 = float2(3.0) + z - s + (2.0*s - float2(4.0))/level3;
+	const float2 level1 = float2(1.0) + z - s + (s - float2(1.0))/level2;
+	return (pow(z, s) * exp(-z)) / level1;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Float version: 4-level continued fraction, evaluated from the
+    //  innermost level (level4) outward.  Expects s < ~0.5, z > ~0.775075.
+	const float level4 = 7.0 + z - s;
+	const float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
+	const float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
+	const float level1 = 1.0 + z - s + (s - 1.0)/level2;
+	return (pow(z, s) * exp(-z)) / level1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate normalized lower incomplete gamma function for
+    //              s < 0.5.  Restricting s means only two branches (not four)
+    //              are needed, selected by z.  Each branch uses four terms,
+    //              with a max relative error of ~0.00182.  The branch
+    //              threshold and specifics were adapted for fewer terms from
+    //              Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+	//  Both branches are always evaluated and then blended with 0/1 masks;
+	//  the original author measured real branches as slower.
+	static const float4 branch_z = float4(0.775075);
+	const float4 from_upper = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float4 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool4 pick_upper;
+	pick_upper.x = z.x > branch_z.x;
+	pick_upper.y = z.y > branch_z.y;
+	pick_upper.z = z.z > branch_z.z;
+	pick_upper.w = z.w > branch_z.w;
+	bool4 pick_series = not(pick_upper);
+	return from_upper * float4(pick_upper) + from_series * float4(pick_series);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  Float3 version: evaluate both z-range branches, blend with 0/1 masks.
+	static const float3 branch_z = float3(0.775075);
+	const float3 from_upper = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float3 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool3 pick_upper;
+	pick_upper.x = z.x > branch_z.x;
+	pick_upper.y = z.y > branch_z.y;
+	pick_upper.z = z.z > branch_z.z;
+	bool3 pick_series = not(pick_upper);
+	return from_upper * float3(pick_upper) + from_series * float3(pick_series);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  Float2 version: evaluate both z-range branches, blend with 0/1 masks.
+	static const float2 branch_z = float2(0.775075);
+	const float2 from_upper = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float2 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool2 pick_upper;
+	pick_upper.x = z.x > branch_z.x;
+	pick_upper.y = z.y > branch_z.y;
+	bool2 pick_series = not(pick_upper);
+	return from_upper * float2(pick_upper) + from_series * float2(pick_series);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Float version: evaluate both z-range branches, blend with 0/1 masks.
+	static const float branch_z = 0.775075;
+	const float from_upper = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	const bool pick_upper = z > branch_z;
+	return from_upper * float(pick_upper) + from_series * float(!pick_upper);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    Approximate normalized lower incomplete gamma function for
+    //              s < 0.5, via normalized_ligamma_impl() (see it for details).
+	const float4 recip_s = float4(1.0)/s;
+	const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  Float3 version: derive 1/s and 1/gamma(s) once, then delegate.
+	const float3 recip_s = float3(1.0)/s;
+	const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  Float2 version: derive 1/s and 1/gamma(s) once, then delegate.
+	const float2 recip_s = float2(1.0)/s;
+	const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Float version: derive 1/s and 1/gamma(s) once, then delegate.
+	const float recip_s = 1.0/s;
+	const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 uv2_to_uv4(float2 tex_uv)
+{
+    //  Zero-pad a float2 uv offset so it can be added to float4 tex2Dlod coords:
+    return float4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Squared-length helper macro for static constants (expands vec twice):
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+inline float get_fast_gaussian_weight_sum_inv(const float sigma)
+{
+    //  Returns an estimate of 1/(sum of unnormalized Gaussian blur weights).
+    //  The unnormalized center weight is 1.0, so the normalized center weight
+    //  equals the weight sum inverse.  For a large enough blur (9+), the
+    //  Gaussian integral gives a close asymptotic value for it:
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //                    = erf(0.5/(sigma*sqrt(2.0))), as erf(-x) == -erf(x)
+    //  The curve fit below is used instead.  Per the original author it was
+    //  built from 64 blur sizes in [3, 131) and 255 equally-spaced sigmas in
+    //  (0, blurN_std_dev), so small-sigma results are biased toward smaller
+    //  blurs; its max error is 0.0031793913, and it measured slightly faster
+    //  (relative FPS: 134.3 with erf, 135.8 with curve-fitting).
+    //  Disabled erf-based alternative: return erf((0.5/sqrt(2.0))/sigma);
+    const float double_exp_fit = exp(exp(0.348348412457428/
+        (sigma - 0.0860587260734721)));
+    const float inverse_sigma_fit = 0.399334576340352/sigma;
+    return min(double_exp_fit, inverse_sigma_fit);
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 11x Gaussian blurred lookup from 11 taps along dxdy.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Tap weights are exp(-d*d/(2*sigma*sigma)) for tap distances d = 0-5;
+    //  constant factors are dropped because the sum is normalized below.
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float g4 = exp(-16.0 * inv_two_var);
+    const float g5 = exp(-25.0 * inv_two_var);
+    const float norm = 1.0 /
+        (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5));
+    //  Accumulate taps from -5 to +5 steps of dxdy, then normalize once.  Per
+    //  the original author, blurs are currently optimized for dynamic weights.
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += g5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    acc += g4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    acc += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    acc += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    acc += g4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    acc += g5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 9x Gaussian blurred lookup from 9 taps along dxdy.  It
+    //              may be mipmapped depending on settings and dxdy.
+    //  Tap weights are exp(-d*d/(2*sigma*sigma)) for distances d = 0-4:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float g4 = exp(-16.0 * inv_two_var);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Accumulate taps from -4 to +4 steps of dxdy, then normalize once:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += g4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    acc += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    acc += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    acc += g4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 7x Gaussian blurred lookup from 7 taps along dxdy.  It
+    //              may be mipmapped depending on settings and dxdy.
+    //  Tap weights are exp(-d*d/(2*sigma*sigma)) for distances d = 0-3:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Accumulate taps from -3 to +3 steps of dxdy, then normalize once:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    acc += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 5x Gaussian blurred lookup from 5 taps along dxdy.  It
+    //              may be mipmapped depending on settings and dxdy.
+    //  Tap weights are exp(-d*d/(2*sigma*sigma)) for distances d = 0-2:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Accumulate taps from -2 to +2 steps of dxdy, then normalize once:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    acc += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 3x Gaussian blurred lookup from 3 taps along dxdy.  It
+    //              may be mipmapped depending on settings and dxdy.
+    //  Tap weights are exp(-d*d/(2*sigma*sigma)) for distances d = 0-1:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float norm = 1.0 / (g0 + 2.0 * g1);
+    //  Accumulate taps from -1 to +1 steps of dxdy, then normalize once:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    return acc * norm;
+}
+
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup using 6 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Per-texel weights are exp(-d*d/(2*sigma*sigma)) for distances d = 0-5:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float g4 = exp(-16.0 * inv_two_var);
+    const float g5 = exp(-25.0 * inv_two_var);
+    const float norm = 1.0 /
+        (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5));
+    //  Merge texel pairs into linear taps: a pair's weight is the sum of its
+    //  texel weights, and its tap sits (far weight/pair weight) past the near
+    //  texel.  The center texel (g0) feeds both inner taps, so it is halved.
+    const float pair01 = g0 * 0.5 + g1;
+    const float pair23 = g2 + g3;
+    const float pair45 = g4 + g5;
+    const float shift01 = g1/pair01;
+    const float shift23 = g3/pair23;
+    const float shift45 = g5/pair45;
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += pair45 * tex2D_linearize(tex, tex_uv - (4.0 + shift45) * dxdy).rgb;
+    acc += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + shift23) * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv - shift01 * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv + shift01 * dxdy).rgb;
+    acc += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + shift23) * dxdy).rgb;
+    acc += pair45 * tex2D_linearize(tex, tex_uv + (4.0 + shift45) * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 4 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Per-texel weights are exp(-d*d/(2*sigma*sigma)) for distances d = 0-4:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float g4 = exp(-16.0 * inv_two_var);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Merge texel pairs (1,2) and (3,4) into linear taps placed between them:
+    const float pair12 = g1 + g2;
+    const float pair34 = g3 + g4;
+    const float shift12 = g2/pair12;
+    const float shift34 = g4/pair34;
+    //  Sum the center texel and the four paired taps, then normalize once:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += pair34 * tex2D_linearize(tex, tex_uv - (3.0 + shift34) * dxdy).rgb;
+    acc += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + shift12) * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + shift12) * dxdy).rgb;
+    acc += pair34 * tex2D_linearize(tex, tex_uv + (3.0 + shift34) * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup using 4 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Per-texel weights are exp(-d*d/(2*sigma*sigma)) for distances d = 0-3:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Merge texel pairs into linear taps placed between them.  The center
+    //  texel (g0) feeds both inner taps, so only half its weight goes to each.
+    const float pair01 = g0 * 0.5 + g1;
+    const float pair23 = g2 + g3;
+    const float shift01 = g1/pair01;
+    const float shift23 = g3/pair23;
+    //  Sum the four paired taps, then normalize once:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + shift23) * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv - shift01 * dxdy).rgb;
+    acc += pair01 * tex2D_linearize(tex, tex_uv + shift01 * dxdy).rgb;
+    acc += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + shift23) * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 2 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Per-texel weights are exp(-d*d/(2*sigma*sigma)) for distances d = 0-2:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float norm = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Merge texels 1 and 2 on each side into one linear tap between them:
+    const float pair12 = g1 + g2;
+    const float shift12 = g2/pair12;
+    //  Sum the center texel and the two paired taps, then normalize once:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + shift12) * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + shift12) * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Per-texel weights are exp(-d*d/(2*sigma*sigma)) for distances d = 0-1:
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    //  Each tap merges half of the center texel with one neighbor, so both
+    //  taps carry weight w01 and sit w1/w01 texels from the center:
+    const float w01 = w0 * 0.5 + w1;
+    const float w01_ratio = w1/w01;
+    //  Normalizing gives w01/(w0 + 2.0 * w1) = 0.5 per tap, so the weight sum
+    //  inverse is never needed; just average the two samples:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+
+////////////////////////////  HUGE SEPARABLE BLURS  ////////////////////////////
+
+//  Huge separable blurs come only in "fast" versions.
+float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Texel weights: wN = exp(-N*N/(2*sigma*sigma)) for the texel N steps out.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    const float w16 = exp(-256.0 * denom_inv);
+    const float w17 = exp(-289.0 * denom_inv);
+    const float w18 = exp(-324.0 * denom_inv);
+    const float w19 = exp(-361.0 * denom_inv);
+    const float w20 = exp(-400.0 * denom_inv);
+    const float w21 = exp(-441.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 +
+    //        w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  used in place of the explicit sum above
+    //  Calculate combined weights and linear sample ratios between texel pairs.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w16_17 = w16 + w17;
+    const float w18_19 = w18 + w19;
+    const float w20_21 = w20 + w21;
+    const float w0_1_ratio = w1/w0_1;  //  fraction of the way from texel 0 to texel 1
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    const float w16_17_ratio = w17/w16_17;
+    const float w18_19_ratio = w19/w18_19;
+    const float w20_21_ratio = w21/w20_21;
+    //  Sum the 22 weighted taps in order of offset, then normalize once:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 31x Gaussian blurred texture lookup using 16 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Texel weights: wN = exp(-N*N/(2*sigma*sigma)) for the texel N steps out.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 +
+    //        w9 + w10 + w11 + w12 + w13 + w14 + w15));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  used in place of the explicit sum above
+    //  Calculate combined weights and linear sample ratios between texel pairs.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w0_1_ratio = w1/w0_1;  //  fraction of the way from texel 0 to texel 1
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    //  Sum the 16 weighted taps in order of offset, then normalize once:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    One 1D pass of a 25-texel Gaussian blur centered on
+    //              tex_uv: 1 center tap plus 6 texel-pair taps per side.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weight of the texel k steps out: exp(-k*k*inv_2var)
+    const float inv_2var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_2var);
+    const float g2 = exp(-4.0 * inv_2var);
+    const float g3 = exp(-9.0 * inv_2var);
+    const float g4 = exp(-16.0 * inv_2var);
+    const float g5 = exp(-25.0 * inv_2var);
+    const float g6 = exp(-36.0 * inv_2var);
+    const float g7 = exp(-49.0 * inv_2var);
+    const float g8 = exp(-64.0 * inv_2var);
+    const float g9 = exp(-81.0 * inv_2var);
+    const float g10 = exp(-100.0 * inv_2var);
+    const float g11 = exp(-121.0 * inv_2var);
+    const float g12 = exp(-144.0 * inv_2var);
+    //  Explicit normalizer, for reference:
+    //      1.0 / (g0 + 2.0 * (g1 + g2 + ... + g12))
+    const float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Merge neighbors into pairs; each pair is fetched with a single tap:
+    const float pair1_2 = g1 + g2;
+    const float pair3_4 = g3 + g4;
+    const float pair5_6 = g5 + g6;
+    const float pair7_8 = g7 + g8;
+    const float pair9_10 = g9 + g10;
+    const float pair11_12 = g11 + g12;
+    const float off1_2 = 1.0 + g2/pair1_2;
+    const float off3_4 = 3.0 + g4/pair3_4;
+    const float off5_6 = 5.0 + g6/pair5_6;
+    const float off7_8 = 7.0 + g8/pair7_8;
+    const float off9_10 = 9.0 + g10/pair9_10;
+    const float off11_12 = 11.0 + g12/pair11_12;
+    //  Accumulate taps from the most negative offset to the most positive:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += pair11_12 * tex2D_linearize(tex, tex_uv - off11_12 * dxdy).rgb;
+    acc += pair9_10 * tex2D_linearize(tex, tex_uv - off9_10 * dxdy).rgb;
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv - off7_8 * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv - off5_6 * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv - off3_4 * dxdy).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv - off1_2 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv + off1_2 * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv + off3_4 * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv + off5_6 * dxdy).rgb;
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv + off7_8 * dxdy).rgb;
+    acc += pair9_10 * tex2D_linearize(tex, tex_uv + off9_10 * dxdy).rgb;
+    acc += pair11_12 * tex2D_linearize(tex, tex_uv + off11_12 * dxdy).rgb;
+    return acc * norm;
+}
+
+float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    One 1D pass of a 17-texel Gaussian blur centered on
+    //              tex_uv: 1 center tap plus 4 texel-pair taps per side.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weight of the texel k steps out: exp(-k*k*inv_2var)
+    const float inv_2var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_2var);
+    const float g2 = exp(-4.0 * inv_2var);
+    const float g3 = exp(-9.0 * inv_2var);
+    const float g4 = exp(-16.0 * inv_2var);
+    const float g5 = exp(-25.0 * inv_2var);
+    const float g6 = exp(-36.0 * inv_2var);
+    const float g7 = exp(-49.0 * inv_2var);
+    const float g8 = exp(-64.0 * inv_2var);
+    //  Explicit normalizer, for reference:
+    //      1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5 + g6 + g7 + g8))
+    const float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Merge neighbors into pairs; each pair is fetched with a single tap:
+    const float pair1_2 = g1 + g2;
+    const float pair3_4 = g3 + g4;
+    const float pair5_6 = g5 + g6;
+    const float pair7_8 = g7 + g8;
+    const float off1_2 = 1.0 + g2/pair1_2;
+    const float off3_4 = 3.0 + g4/pair3_4;
+    const float off5_6 = 5.0 + g6/pair5_6;
+    const float off7_8 = 7.0 + g8/pair7_8;
+    //  Accumulate taps from the most negative offset to the most positive:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv - off7_8 * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv - off5_6 * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv - off3_4 * dxdy).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv - off1_2 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv + off1_2 * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv + off3_4 * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv + off5_6 * dxdy).rgb;
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv + off7_8 * dxdy).rgb;
+    return acc * norm;
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input.
+    //  Description:
+    //  The one arbitrarily resizable single-pass blur.  A tex2Dblur5x5resize
+    //  would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize.
+    //  Gaussian weights by texel distance from the center:
+    const float inv_2var = 0.5/(sigma*sigma);
+    const float center_w = 1.0;
+    const float edge_w = exp(-LENGTH_SQ(float2(1.0, 0.0)) * inv_2var);
+    const float corner_w = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_2var);
+    const float norm = 1.0/(center_w + 4.0 * (edge_w + corner_w));
+    //  All nine taps are fetched individually.  Quad-pixel communication
+    //  won't help either: This should perform like tex2Dblur5x5, but sharing a
+    //  4x4 sample field would perform more like tex2Dblur8x8shared (worse).
+    const float2 step_x = float2(dxdy.x, 0.0);
+    const float2 step_y = float2(0.0, dxdy.y);
+    const float2 row_m_uv = tex_uv - step_y;
+    const float2 row_p_uv = tex_uv + step_y;
+    const float3 tap_mm = tex2D_linearize(tex, row_m_uv - step_x).rgb;
+    const float3 tap_0m = tex2D_linearize(tex, row_m_uv).rgb;
+    const float3 tap_pm = tex2D_linearize(tex, row_m_uv + step_x).rgb;
+    const float3 tap_m0 = tex2D_linearize(tex, tex_uv - step_x).rgb;
+    const float3 tap_00 = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 tap_p0 = tex2D_linearize(tex, tex_uv + step_x).rgb;
+    const float3 tap_mp = tex2D_linearize(tex, row_p_uv - step_x).rgb;
+    const float3 tap_0p = tex2D_linearize(tex, row_p_uv).rgb;
+    const float3 tap_pp = tex2D_linearize(tex, row_p_uv + step_x).rgb;
+    //  Tap names give the x then y offset sign (m = minus, 0, p = plus).
+    const float3 edge_sum = tap_0m + tap_m0 + tap_p0 + tap_0p;
+    const float3 corner_sum = tap_mm + tap_pm + tap_mp + tap_pp;
+    const float3 blended = center_w * tap_00 +
+        edge_w * edge_sum + corner_w * corner_sum;
+    return blended * norm;
+}
+
+
+////////////////////////////  FASTER ONE-PASS BLURS  ///////////////////////////
+
+float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  One-pass 2D Gaussian blur over a 9x9 texel footprint.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.  Adjust the
+    //  bilinear sample location to reflect the true Gaussian weights for each
+    //  underlying texel.  The following diagram illustrates the relative
+    //  locations of bilinear samples.  Each sample with the same number has the
+    //  same weight (notice the symmetry).  The letters a, b, c, d distinguish
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float w4off = exp(-16.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    const float texel3to4ratio = w4off/(w3off + w4off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0);
+    const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+    const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio);
+    const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio);
+    const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2R1 = w3off;
+    const float w2R2 = w4off;
+    const float w3d1 =     exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w3d4 =     exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv);
+    const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv);
+    const float w6d1 =     exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv);
+    const float w6d4 =     exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2R1 + w2R2;
+    const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    const float w5 = w4;  //  sample 5 is sample 4 with its x and y offsets swapped
+    const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse; each non-center weight is used by 4 samples:
+    const float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);  //  negates the x offset only
+    const float2 mirror_y = float2(1.0, -1.0);  //  negates the y offset only
+    const float2 mirror_xy = float2(-1.0, -1.0);  //  negates both offsets
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear:
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum weighted samples; weight_sum_inv then scales the total weight to 1.0.
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
+    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);  //  texel 0 is shared by two samples per axis, so halved
+    const float texel2to3ratio = w3off/(w2off + w3off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+    const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and."
+    const float w1abcd = 1.0;
+    const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv);
+    const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv);
+    const float w1d4 =       exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d3_3d2 =   exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4_3d4 =   exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d1 =       exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d2_4d3 =   exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4 =       exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights.
+    //  Split weights for shared texels between samples sharing them:
+    const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;  //  center shared 4 ways; two half-shared axis texels add up to one
+    const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;
+    const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;
+    //  Get the weight sum inverse; w2_3 counts twice (samples 2 and 3):
+    const float weight_sum_inv =
+        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);  //  negates the x offset only
+    const float2 mirror_y = float2(1.0, -1.0);  //  negates the y offset only
+    const float2 mirror_xy = float2(-1.0, -1.0);  //  negates both offsets
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum weighted samples; weight_sum_inv then scales the total weight to 1.0.
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
+    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
+    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
+    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  One-pass 5x5 Gaussian blur assembled from a 3x3 grid of samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  Smaller-scale version of tex2Dblur9x9(); see that function for the
+    //  reasoning behind the sample/texel placement.  Samples:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  SAMPLE PLACEMENT:
+    //  Along one axis, a tap covering the texels 1 and 2 steps out sits at
+    //  1 + g2/(g1 + g2) texels, i.e. nearer to whichever texel weighs more.
+    const float inv_2var = 0.5/(sigma*sigma);
+    const float g1 = exp(-1.0 * inv_2var);
+    const float g2 = exp(-4.0 * inv_2var);
+    const float tap_dist = 1.0 + g2/(g1 + g2);
+    //  Texel offsets of the +x (1R), +y (1D) and +x+y (2d) samples.  The
+    //  remaining samples are mirror images of these three:
+    const float2 off_x = float2(tap_dist, 0.0);
+    const float2 off_y = float2(0.0, tap_dist);
+    const float2 off_xy = float2(tap_dist, tap_dist);
+
+    //  KERNEL WEIGHTS:
+    //  A sample's weight is the sum of the Gaussian weights of the texels it
+    //  covers.  A quadrant sample covers texels (1,1), (2,1), (1,2), (2,2):
+    const float w_11 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_2var);
+    const float w_21 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * inv_2var);
+    const float w_22 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * inv_2var);
+    const float center_w = 1.0;
+    const float axis_w = g1 + g2;
+    const float quad_w = w_11 + 2.0 * w_21 + w_22;
+    //  Normalizer: 1 center sample, 4 axis samples and 4 quadrant samples.
+    const float norm = 1.0/(center_w + 4.0 * (axis_w + quad_w));
+
+    //  TEXTURE FETCHES:
+    //  9 samples in total (1 nearest, 4 linear, 4 bilinear).  Sign-flipped
+    //  texel steps place the mirrored quadrant samples from the one off_xy:
+    const float2 dxdy_flip_x = dxdy * float2(-1.0, 1.0);
+    const float2 dxdy_flip_y = dxdy * float2(1.0, -1.0);
+    const float2 dxdy_flip_xy = dxdy * float2(-1.0, -1.0);
+    //  Center and the four axis-aligned samples:
+    const float3 tap_0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 tap_1R = tex2D_linearize(tex, tex_uv + dxdy * off_x).rgb;
+    const float3 tap_1D = tex2D_linearize(tex, tex_uv + dxdy * off_y).rgb;
+    const float3 tap_1L = tex2D_linearize(tex, tex_uv - dxdy * off_x).rgb;
+    const float3 tap_1U = tex2D_linearize(tex, tex_uv - dxdy * off_y).rgb;
+    //  Quadrant samples; a/b/c reuse off_xy with x and/or y negated:
+    const float3 tap_2d = tex2D_linearize(tex, tex_uv + dxdy * off_xy).rgb;
+    const float3 tap_2c = tex2D_linearize(tex, tex_uv + dxdy_flip_x * off_xy).rgb;
+    const float3 tap_2b = tex2D_linearize(tex, tex_uv + dxdy_flip_y * off_xy).rgb;
+    const float3 tap_2a = tex2D_linearize(tex, tex_uv + dxdy_flip_xy * off_xy).rgb;
+
+    //  WEIGHTED SUM:
+    //  Accumulate each sample group with its shared weight, then apply the
+    //  normalizer so the effective weights total 1.0.
+    float3 acc = center_w * tap_0C;
+    acc += axis_w * (tap_1R + tap_1D + tap_1L + tap_1U);
+    acc += quad_w * (tap_2a + tap_2b + tap_2c + tap_2d);
+    return acc * norm;
+}
+
+float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed of
+    //              2x2 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      0a 0b
+    //      0c 0d
+    //  The texel layout is as follows.  Note that samples 0a/0b and 0c/0d share
+    //  a vertical column of texels, and samples 0a/0c and 0b/0d share a
+    //  horizontal row of texels (all samples share the center texel):
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
+    //  Statically compute the texel offset from the fragment center to the
+    //  bilinear sample in the bottom-right quadrant (0d); others mirror it:
+    const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 4 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb;
+    const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb;
+    const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb;
+    const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Weights for all samples are the same, so just average them:
+    return 0.25 * (sample0a + sample0b + sample0c + sample0d);
+}
+
+
+//////////////////  LINEAR ONE-PASS BLURS WITH SHARED SAMPLES  /////////////////
+
+float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   1.) Same as tex2Dblur9()
+    //              2.) ddx() and ddy() are present in the current Cg profile.
+    //              3.) The GPU driver is using fine/high-quality derivatives.
+    //              4.) quad_vector *correctly* describes the current fragment's
+    //                  location in its pixel quad, by the conventions noted in
+    //                  get_quad_vector[_naive].
+    //              5.) tex_uv.w = log2(video_size/output_size).y
+    //              6.) tex2Dlod() is present in the current Cg profile.
+    //  Optional:   Tune artifacts vs. excessive blurriness with the global
+    //              float error_blurring.
+    //  Returns:    A blurred texture lookup using a "virtual" 12x12 Gaussian
+    //              blur (a 6x6 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  Perform a 1-pass blur with shared texture lookups across a pixel quad.
+    //  We'll get neighboring samples with high-quality ddx/ddy derivatives, as
+    //  in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
+    //  Message Passing" by Eric Penner.
+    //
+    //  Each fragment of our "virtual" 12x12 blur loads (6*6)/4 + 3 = 12
+    //  bilinear samples, where bilinear sampling positions are computed from
+    //  the relative Gaussian weights of the 4 surrounding texels.  The catch is
+    //  that the appropriate texel weights and sample coords differ for each
+    //  fragment, but we're reusing most of the same samples across a quad of
+    //  destination fragments.  (We do use unique coords for the four nearest
+    //  samples at each fragment.)  Mixing bilinear filtering and sample-sharing
+    //  therefore introduces some error into the weights, and this can get nasty
+    //  when the source image is small or high-frequency.  Computing bilinear
+    //  ratios based on weights at the sample field center results in sharpening
+    //  and ringing artifacts, but we can move samples closer to halfway between
+    //  texels to try blurring away the error (which can move features around by
+    //  a texel or so).  Tune this with the global float "error_blurring".
+    //
+    //  The pixel quad's sample field covers 12x12 texels, accessed through 6x6
+    //  bilinear (2x2 texel) taps.  Each fragment depends on a window of 10x10
+    //  texels (5x5 bilinear taps), and each fragment is responsible for loading
+    //  a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps
+    //  to use unique bilinear coords for sample0* for each fragment.  This
+    //  diagram illustrates the relative locations of bilinear samples 1-9 for
+    //  each quadrant a, b, c, d (note samples will not be equally spaced):
+    //      8a 7a 6a 6b 7b 8b
+    //      5a 4a 3a 3b 4b 5b
+    //      2a 1a 0a 0b 1b 2b
+    //      2c 1c 0c 0d 1d 2d
+    //      5c 4c 3c 3d 4d 5d
+    //      8c 7c 6c 6d 7d 8d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2 texel block:
+    //      8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
+    //      8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
+    //      5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
+    //      5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
+    //      2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
+    //      2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
+    //      2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
+    //      2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
+    //      5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
+    //      5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
+    //      8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
+    //      8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
+    //  With this symmetric arrangement, we don't have to know which absolute
+    //  quadrant a sample lies in to assign kernel weights; it's enough to know
+    //  the sample number and the relative quadrant of the sample (relative to
+    //  the current quadrant):
+    //      {current, adjacent x, adjacent y, diagonal}
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
+    //  and [4, 5] away from the fragment, and reuse them independently for both
+    //  dimensions.  Use the sample field center as the estimated destination,
+    //  but nudge the result closer to halfway between texels to blur error.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  based on the sum of their 4 underlying texel weights.  Assume a same-
+    //  resolution blur, so each symmetrically named sample weight will compute
+    //  the same at every fragment in the pixel quad: We can therefore compute
+    //  texel weights based only on the bottom-right quadrant (fragment at 0d0).
+    //  To avoid too much boilerplate code, use a macro to get all 4 texel
+    //  weights for a bilinear sample based on the offset of its top-left texel:
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
+    const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
+    const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
+    const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
+    const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
+    const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
+    const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
+    const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
+    const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
+    const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    float3 sample8adjx, sample8adjy, sample8diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
+    //  Sum the weighted samples; weight_sum_inv normalizes them on return.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
+    sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
+    sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
+    sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 10x10 Gaussian
+    //              blur (a 5x5 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 25 of the 36 samples taken across the pixel quad (to cover a
+    //  5x5 sample area, or 10x10 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 11 omitted samples
+    //  are always the "same:"
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 25 of the 36 sample weights.  Skip the following weights:
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w4curr + w5curr + w6curr + w7curr + w8curr +
+        w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
+        w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
+        w0diag + w1diag + w3diag + w4diag);
+    //  Statically pack weights for runtime (w8curr stays scalar; mixed packing):
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);
+    const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch neighbor samples from the 2x2 quad (sample8 is never gathered here):
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    //  Sum the weighted samples (the sum is normalized just before returning).
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result.  First do the simple ones:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    //  Now do the mixed-sample ones:
+    sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
+    sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
+    sum += w8curr * sample8curr;
+    //  Normalize the sum (so the weights add to 1.0) and return:
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 8x8 Gaussian
+    //              blur (a 4x4 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This function
+    //  shares the same concept and a similar sample placement, except each
+    //  quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
+    //  respectively.  There could be a total of 16 samples, 4 of which each
+    //  fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
+    //  its own offset to reduce shared sample artifacts, bringing the sample
+    //  count for each fragment to 7.  Sample placement:
+    //      3a 2a 2b 3b
+    //      1a 0a 0b 1b
+    //      1c 0c 0d 1d
+    //      3c 2c 2d 3d
+    //  Texel placement:
+    //      3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
+    //      3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
+    //      1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
+    //      1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
+    //      1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
+    //      1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
+    //      3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
+    //      3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
+    
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 6x6 Gaussian
+    //              blur (a 3x3 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur8x8shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 9 of the 16 samples taken across the pixel quad (to cover a
+    //  3x3 sample area, or 6x6 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 7 omitted samples
+    //  are always the "same:"
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 9 of the 16 sample weights.  Skip the following weights:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
+    //  Statically pack some weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result for sample0*, and handle the rest
+    //  of the weights more directly/verbosely:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
+            w2adjx * sample2adjx + w3curr * sample3curr;
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  MAX OPTIMAL SIGMA BLUR WRAPPERS  //////////////////////
+
+//  The following blurs are static wrappers around the dynamic blurs above.
+//  HOPEFULLY, the compiler will be smart enough to do constant-folding.
+
+//  Resizable separable blurs:
+inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur11_std_dev.
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur9_std_dev.
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur7_std_dev.
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur5_std_dev.
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur3_std_dev.
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Fast separable blurs:
+inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur11_std_dev.
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur9_std_dev.
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur7_std_dev.
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur5_std_dev.
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur3_std_dev.
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Huge, "fast" separable blurs:
+inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur43_std_dev.
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur31_std_dev.
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur25_std_dev.
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur17_std_dev.
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+//  Resizable one-pass blurs:
+inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur3_std_dev.
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" one-pass blurs:
+inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur9_std_dev.
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur7_std_dev.
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur5_std_dev.
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, const float2 dxdy)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur3_std_dev.
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" shared-sample one-pass blurs:
+inline float3 tex2Dblur12x12shared(const sampler2D tex, const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur12_std_dev.
+    return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev);
+}
+inline float3 tex2Dblur10x10shared(const sampler2D tex, const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur10_std_dev.
+    return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev);
+}
+inline float3 tex2Dblur8x8shared(const sampler2D tex, const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur8_std_dev.
+    return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev);
+}
+inline float3 tex2Dblur6x6shared(const sampler2D tex, const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    //  Static wrapper: call the dynamic-sigma overload with blur6_std_dev.
+    return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev);
+}
+
+
+#endif  //  BLUR_FUNCTIONS_H
+
+////////////////////////////  END BLUR-FUNCTIONS  ///////////////////////////
+
+///////////////////////////////  BLOOM CONSTANTS  //////////////////////////////
+
+//  Max desired pixel difference in a blurred triad (the "thresh" used below):
+static const float bloom_diff_thresh = 1.0/256.0;
+
+
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Requires:   1.) triad_size: the final phosphor triad size, in pixels.
+    //              2.) thresh: the max desired pixel difference within the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    The minimum sigma that fully blurs an on-screen phosphor
+    //              triad to an even color, within thresh.  The closed form
+    //              below was found by curve-fitting data.
+    //  Fit quality: max error = ~0.086036, mean sq. error = ~0.0013387.
+    //  (A simpler fit, 0.5985*triad_size - triad_size*sqrt(thresh), has
+    //  max error = ~0.16486 and mean sq. error = ~0.0041041.)
+    const float sqrt_term = sqrt(0.000416 + thresh);
+    return -0.05168 + 0.6113*triad_size - 1.122*triad_size*sqrt_term;
+}
+
+inline float get_absolute_scale_blur_sigma(const float thresh)
+{
+    //  Requires:   1.) min_allowed_viewport_triads must be a global; the number
+    //                  of horizontal phosphor triads in the final image must be
+    //                  >= min_allowed_viewport_triads.x for realistic results.
+    //              2.) bloom_approx_scale_x must be a global float equal to the
+    //                  absolute horizontal scale of BLOOM_APPROX.
+    //              3.) bloom_approx_scale_x/min_allowed_viewport_triads.x
+    //                  should be <= 1.1658025090 to keep the final result <
+    //                  0.62666015625 (the largest sigma ensuring the largest
+    //                  unused texel weight stays < 1.0/256.0 for a 3x3 blur).
+    //              4.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //              5.) max_viewport_size_x must be declared before this point.
+    //  Returns:    The minimum Gaussian sigma that blurs the pass output as
+    //              much as it would take to blur away bloom_approx_scale_x
+    //              horizontal phosphor triads.
+    //  Description:
+    //  BLOOM_APPROX should look like a downscaled phosphor blur.  The ideal
+    //  sigma is the actual phosphor bloom sigma scaled down by
+    //  (bloom_approx_scale_x/viewport_size_x), but the viewport size is unknown
+    //  in this pass, so blur as much as it takes to blur away
+    //  min_allowed_viewport_triads.x triads.  That blurs "more than necessary"
+    //  when the user picks more triads, but blurring a constant fraction of the
+    //  viewport may better resemble a true optical bloom anyway (the viewport
+    //  fills a similar fraction of each player's field of view on any screen).
+    //  Assume an extremely large viewport size for asymptotic results:
+    const float asymptotic_triad_size =
+        max_viewport_size_x/min_allowed_viewport_triads.x;
+    return bloom_approx_scale_x/max_viewport_size_x *
+        get_min_sigma_to_blur_triad(asymptotic_triad_size, thresh);
+}
+
+inline float get_center_weight(const float sigma)
+{
+    //  Given a Gaussian blur sigma, get the blur weight for the center texel.
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        return get_fast_gaussian_weight_sum_inv(sigma);
+    #else
+        const float denom_inv = 0.5/(sigma*sigma);
+        const float w0 = 1.0;
+        const float w1 = exp(-1.0 * denom_inv);
+        const float w2 = exp(-4.0 * denom_inv);
+        const float w3 = exp(-9.0 * denom_inv);
+        const float w4 = exp(-16.0 * denom_inv);
+        const float w5 = exp(-25.0 * denom_inv);
+        const float w6 = exp(-36.0 * denom_inv);
+        const float w7 = exp(-49.0 * denom_inv);
+        const float w8 = exp(-64.0 * denom_inv);
+        const float w9 = exp(-81.0 * denom_inv);
+        const float w10 = exp(-100.0 * denom_inv);
+        const float w11 = exp(-121.0 * denom_inv);
+        const float w12 = exp(-144.0 * denom_inv);
+        const float w13 = exp(-169.0 * denom_inv);
+        const float w14 = exp(-196.0 * denom_inv);
+        const float w15 = exp(-225.0 * denom_inv);
+        const float w16 = exp(-256.0 * denom_inv);
+        const float w17 = exp(-289.0 * denom_inv);
+        const float w18 = exp(-324.0 * denom_inv);
+        const float w19 = exp(-361.0 * denom_inv);
+        const float w20 = exp(-400.0 * denom_inv);
+        const float w21 = exp(-441.0 * denom_inv);
+        //  Note: If the implementation uses a smaller blur than the max allowed,
+        //  the worst case is an overestimated center weight, which puts a bit
+        //  more energy into the brightpass...no huge deal.  If it instead uses a
+        //  larger blur than the max "allowed" (because of dynamic branching), the
+        //  center weight could be underestimated, which is more of a problem.
+        //  NOTE(review): the original remark broke off at "consider always using".
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            //  43x blur (center weight plus 21 weights per side):
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 +
+                w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            //  31x blur (center weight plus 15 weights per side):
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 +
+                w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            //  25x blur (center weight plus 12 weights per side):
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            //  17x blur (center weight plus 8 weights per side):
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+        #else
+            //  9x blur (center weight plus 4 weights per side):
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+        const float center_weight = weight_sum_inv * weight_sum_inv;
+        return center_weight;
+    #endif
+}
+
+inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Blur with the smallest tex2Dblur*fast variant whose max sigma covers the
+    //  given sigma.  If sigma is static, branching is safe and free, so ignore
+    //  the #define size hints: a large blur is only used when actually needed.
+    #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+    #else
+        //  It's still worth branching if the profile supports dynamic branches:
+        //  It's much faster than using a hugely excessive blur, but each branch
+        //  eats ~1% FPS.
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        #endif
+    #endif
+    //  Failed optimization notes:
+    //  I originally created a same-size mipmapped 5-tap separable blur10 that
+    //  could handle any sigma by reaching into lower mip levels.  It was
+    //  as fast as blur25fast for runtime sigmas and a tad faster than
+    //  blur31fast for static sigmas, but mipmapping two viewport-size passes
+    //  ate 10% of FPS across all codepaths, so it wasn't worth it.
+    #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        if(sigma <= blur9_std_dev)
+        {
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur17_std_dev)
+        {
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur25_std_dev)
+        {
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur31_std_dev)
+        {
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        }
+        else
+        {
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        }
+    #else
+        //  If we can't afford to branch, we can only guess at what blur
+        //  size we need.  Therefore, use the largest blur the hints allow.
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        #else
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    #endif  //  PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+}
+
+inline float get_bloom_approx_sigma(const float output_size_x_runtime,
+    const float estimated_viewport_size_x)
+{
+    //  Requires:   1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
+    //                  This is included for dynamic codepaths just in case the
+    //                  following two globals are incorrect:
+    //              2.) bloom_approx_size_x_for_fake should == the same
+    //                  if PHOSPHOR_BLOOM_FAKE is #defined
+    //              3.) bloom_approx_size_x should == the same otherwise
+    //  Returns:    For gaussian4x4 (bloom_approx_filter > 1.5), return a dynamic
+    //              small bloom sigma that's as close to optimal as possible
+    //              given available information.  Otherwise (blur3x3 or simple
+    //              bilinear filtering), return a static small bloom sigma that
+    //              works well for typical cases.
+    //  Assume the default static value.  This is a compromise that ensures
+    //  typical triads are blurred, even if unusually large ones aren't.
+    static const float mask_num_triads_static =
+        max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
+    const float mask_num_triads_from_size =
+        estimated_viewport_size_x/mask_triad_size_desired;
+    const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x,
+        lerp(mask_num_triads_from_size, mask_num_triads_desired,
+            mask_specify_num_triads));
+    //  Assume an extremely large viewport size for asymptotic results:
+    static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
+    {
+        //  Use the runtime num triads and output size:
+        const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_runtime;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_runtime/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  account for the Gaussian scanline sigma from the last pass too.
+        //  The bloom will be too wide horizontally but tall enough vertically.
+        return length(float2(bloom_approx_sigma, beam_max_sigma));
+    }
+    else    //  3x3 blur resize (the bilinear resize doesn't need a sigma)
+    {
+        //  We're either using blur3x3 or bilinear filtering.  The biggest
+        //  reason to choose blur3x3 is to avoid dynamic weights, so use a
+        //  static calculation.
+        #ifdef PHOSPHOR_BLOOM_FAKE
+            static const float output_size_x_static =
+                bloom_approx_size_x_for_fake;
+        #else
+            static const float output_size_x_static = bloom_approx_size_x;
+        #endif
+        static const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_static;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_static/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  try accounting for the Gaussian scanline sigma from the last pass
+        //  too; use the static default value:
+        return length(float2(bloom_approx_sigma, beam_max_sigma_static));
+    }
+}
+
+inline float get_final_bloom_sigma(const float bloom_sigma_runtime)
+{
+    //  Requires:   1.) bloom_sigma_runtime: a precalculated sigma that is
+    //                  optimal for the [known] triad size.
+    //              2.) The caller is a fragment shader (NOT a vertex shader);
+    //                  otherwise blurs with static sigmas won't be
+    //                  constant-folded.
+    //  Returns:    With RUNTIME_PHOSPHOR_BLOOM_SIGMA #defined, the runtime
+    //              sigma passed in (10% slower).  Otherwise, the optimistic
+    //              static sigma computed from mask_triad_size_desired_static
+    //              and bloom_diff_thresh.
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        //  Runtime triad sizes: use the caller's precalculated sigma.
+        return bloom_sigma_runtime;
+    #else
+        //  Overblurring looks as bad as underblurring, so size the static
+        //  sigma for average-size triads instead of worst-case huge triads:
+        const float static_triad_size = mask_triad_size_desired_static;
+        return get_min_sigma_to_blur_triad(static_triad_size,
+            bloom_diff_thresh);
+    #endif
+}
+
+
+#endif  //  BLOOM_FUNCTIONS_H
+
+////////////////////////////  END BLOOM-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/blur-functions.h"
+
+////////////////////////////  BEGIN BLUR-FUNCTIONS  ///////////////////////////
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  output_size < video_size.
+//              4.) output_size == video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (video_size/output_size)/texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(video_size/output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static const float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static const float blur3_std_dev
+//                      static const float blur4_std_dev
+//                      static const float blur5_std_dev
+//                      static const float blur6_std_dev
+//                      static const float blur7_std_dev
+//                      static const float blur8_std_dev
+//                      static const float blur9_std_dev
+//                      static const float blur10_std_dev
+//                      static const float blur11_std_dev
+//                      static const float blur12_std_dev
+//                      static const float blur17_std_dev
+//                      static const float blur25_std_dev
+//                      static const float blur31_std_dev
+//                      static const float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static const float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+#ifndef OVERRIDE_BLUR_STD_DEVS
+    //  blurN_std_dev is the sigma of the Nx blur, in terms of dxdy strides.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  By request, we can define standard deviations corresponding to a
+        //  binomial distribution with p = 0.5 (related to Pascal's triangle).
+        //  This distribution works such that blurring multiple times should
+        //  have the same result as a single larger blur.  These values are
+        //  larger than default for blurs up to 6x and smaller thereafter.
+        static const float blur3_std_dev = 0.84931640625;
+        static const float blur4_std_dev = 0.84931640625;
+        static const float blur5_std_dev = 1.0595703125;
+        static const float blur6_std_dev = 1.06591796875;
+        static const float blur7_std_dev = 1.17041015625;
+        static const float blur8_std_dev = 1.1720703125;
+        static const float blur9_std_dev = 1.2259765625;
+        static const float blur10_std_dev = 1.21982421875;
+        static const float blur11_std_dev = 1.25361328125;
+        static const float blur12_std_dev = 1.2423828125;
+        static const float blur17_std_dev = 1.27783203125;
+        static const float blur25_std_dev = 1.2810546875;
+        static const float blur31_std_dev = 1.28125;
+        static const float blur43_std_dev = 1.28125;
+    #else   //  Default (non-binomial) standard deviations:
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        static const float blur3_std_dev = 0.62666015625;
+        static const float blur4_std_dev = 0.66171875;
+        static const float blur5_std_dev = 0.9845703125;
+        static const float blur6_std_dev = 1.02626953125;
+        static const float blur7_std_dev = 1.36103515625;
+        static const float blur8_std_dev = 1.4080078125;
+        static const float blur9_std_dev = 1.7533203125;
+        static const float blur10_std_dev = 1.80478515625;
+        static const float blur11_std_dev = 2.15986328125;
+        static const float blur12_std_dev = 2.215234375;
+        static const float blur17_std_dev = 3.45535583496;
+        static const float blur25_std_dev = 5.3409576416;
+        static const float blur31_std_dev = 6.86488037109;
+        static const float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
+    //  in shared-sample blurs but increase blurring and feature shifting.
+    static const float error_blurring = 0.5;
+#endif  //  OVERRIDE_ERROR_BLURRING
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants unless OVERRIDE_STANDARD_GAMMA is #defined:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it, so alpha is passed through by default.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Expose device gammas through getters, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define crt_gamma, gba_gamma, lcd_gamma:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Defaults: reference CRT gamma, fixed GBA value, office LCD gamma
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else   //  Derive the gammas from the first SIMULATE_* macro that's #defined:
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO  //  Only the first/last passes handle gamma:
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes input and encodes output
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Bilinear filtering is gamma-correct only if inputs needn't be linearized:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode a linear color for this pass's output.  The rgb channels
+    //  are raised to 1.0/get_pass_output_gamma(); alpha is forced to 1.0 if
+    //  assume_opaque_alpha is set, and it's passed through otherwise.
+    //  Passes that don't gamma-encode their output return color untouched:
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    const float3 encode_exponent = float3(1.0/get_pass_output_gamma());
+    const float3 encoded_rgb = pow(color.rgb, encode_exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize a color read from this pass's input.  The rgb channels are
+    //  raised to get_pass_input_gamma(); alpha is forced to 1.0 if
+    //  assume_opaque_alpha is set, and it's passed through otherwise.
+    //  Passes that don't linearize their input return color untouched:
+    if(!linearize_input)
+    {
+        return color;
+    }
+    const float3 decode_exponent = float3(get_pass_input_gamma());
+    const float3 linear_rgb = pow(color.rgb, decode_exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Linearize color with the caller's per-channel gamma (linearize_input
+    //  isn't consulted).  Alpha is 1.0 if assume_opaque_alpha, else color.a:
+    const float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample tex and decode the result with decode_input().  Coords are left non-const; the note above the section header attributes offset blurs to 'const' coords.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 coords: only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is passed as textureLod()'s third argument (the LOD in GLSL), not applied as a texel offset -- confirm intent.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  float3 coords with texel_off: only tex_coords.xy is used; texel_off again lands in textureLod()'s third argument.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: decode_input() applied to a textureLod() lookup at tex_coords.xy with a literal LOD of 0.0; tex_coords.zw are not read.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): here texel_off is passed as textureLod()'s third (LOD) argument and tex_coords.zw are not read -- confirm intent.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
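+/////////*  NOTE: this line does NOT end the block comment opened above "tex3D:"; the comment stays open until the terminator right before the live "tex2Dlod:" gamma wrappers below, so everything in between (including tex2D_linearize_gamma) is disabled.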
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: decode_gamma_input() applied to a textureLod() lookup at tex_coords.xy with a literal LOD of 0.0; tex_coords.zw are not read.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): here texel_off is passed as textureLod()'s third (LOD) argument and tex_coords.zw are not read -- confirm intent.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "quad-pixel-communication.h"
+
+///////////////////////  BEGIN QUAD-PIXEL-COMMUNICATION  //////////////////////
+
+#ifndef QUAD_PIXEL_COMMUNICATION_H
+#define QUAD_PIXEL_COMMUNICATION_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey*
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DISCLAIMER  /////////////////////////////////
+
+//  *This code was inspired by "Shader Amortization using Pixel Quad Message
+//  Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2.  My intent
+//  is not to plagiarize his fundamentally similar code and assert my own
+//  copyright, but the algorithmic helper functions require so little code that
+//  implementations can't vary by much except bugfixes and conventions.  I just
+//  wanted to license my own particular code here to avoid ambiguity and make it
+//  clear that as far as I'm concerned, people can do as they please with it.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  Given screen pixel numbers, derive a "quad vector" describing a fragment's
+//  position in its 2x2 pixel quad.  Given that vector, obtain the values of any
+//  variable at neighboring fragments.
+//  Requires:   Using this file in general requires:
+//              1.) ddx() and ddy() are present in the current Cg profile.
+//              2.) The GPU driver is using fine/high-quality derivatives.
+//                  Functions will give incorrect results if this is not true,
+//                  so a test function is included.
+
+
+/////////////////////  QUAD-PIXEL COMMUNICATION PRIMITIVES  ////////////////////
+
+float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Purpose:    Parity-based first guess at the fragment's place in its 2x2 quad.
+    //  Requires:   output_pixel_num_wrt_uvxy holds two measures of the
+    //              fragment's output pixel number, both spanning
+    //              ([0, output_size.x), [0, output_size.y)):
+    //              .xy grows along the uv axes, .zw grows along screen xy.
+    //  Returns:    For whole-number inputs, each component is -1.0 on an
+    //              even pixel number and 1.0 on an odd one, which gives:
+    //              .xy = placement with respect to the uv direction, with
+    //                    the (0, 0) origin at the top-left:
+    //                    top-left    = (-1.0, -1.0) top-right    = ( 1.0, -1.0)
+    //                    bottom-left = (-1.0,  1.0) bottom-right = ( 1.0,  1.0)
+    //                    Needed to arrange/weight shared texture samples.
+    //              .zw = placement with respect to screen xy (position);
+    //                    the origin varies.  quad_gather depends on it.
+    //              Note: quad_vector.zw = quad_vector.xy * float2(
+    //                      ddx(output_pixel_num_wrt_uvxy.x),
+    //                      ddy(output_pixel_num_wrt_uvxy.y));
+    //  Caveats:    Assumes the driver starts every 2x2 quad on an even pixel
+    //              number; at odd output resolutions that can be false
+    //              (nondeterministically so).
+    float4 parity = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0;
+    return parity * 2.0 - float4(1.0);
+}
+
+float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   The same input as get_quad_vector_naive() (read it first).
+    //  Returns:    The same quad vector as get_quad_vector_naive(), except
+    //              that it remains correct when the 2x2 pixel quad begins on
+    //              an odd pixel, as can happen at odd resolutions.
+    float4 naive = get_quad_vector_naive(output_pixel_num_wrt_uvxy);
+    //  If naive.zw fails to increase with screen xy, the 2x2 pixel quad starts
+    //  on an odd pixel; the halved derivatives below then mirror the guess.
+    float mirror_x = ddx(naive.z);
+    float mirror_y = ddy(naive.w);
+    float2 odd_start_mirror = 0.5 * float2(mirror_x, mirror_y);
+    return naive * odd_start_mirror.xyxy;
+}
+
+float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
+{
+    //  Requires:   1.) The current Cg profile provides ddx() and ddy().
+    //              2.) output_pixel_num_wrt_uv rises with the uv coords and
+    //                  gives the fragment's output pixel number within:
+    //                      ([0, output_size.x), [0, output_size.y))
+    //  Returns:    The quad vector described in get_quad_vector_naive()
+    //              (read that first), valid even when the 2x2 pixel quad
+    //              begins on an odd pixel, as can happen at odd resolutions.
+    //  Caveats:    Needs less input than the float4 overload, at the price
+    //              of potentially running slower.
+    //  uv-space guess from pixel parity: for whole n, frac(n * 0.5) * 2.0 is
+    //  0 on even n and 1 on odd n, and recentering maps that to -1 / 1.
+    float2 parity_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
+    float2 guess_uv = (parity_uv - float2(0.5)) * 2.0;
+    //  Direction of screen xy relative to uv, per axis (in {-1, 1}); it
+    //  converts the uv guess into a screen-space guess.
+    float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x),
+                                     ddy(output_pixel_num_wrt_uv.y));
+    float2 guess_screen = guess_uv * screen_uv_mirror;
+    //  A screen guess that fails to rise with screen xy reveals a quad that
+    //  starts on an odd pixel; the halved derivative then mirrors the result.
+    float2 odd_start_mirror = 0.5 * float2(ddx(guess_screen.x),
+                                           ddy(guess_screen.y));
+    return float4(guess_uv, guess_screen) * odd_start_mirror.xyxy;
+}
+
+void quad_gather(float4 quad_vector, float4 curr,
+    out float4 adjx, out float4 adjy, out float4 diag)
+{
+    //  Requires:   1.) ddx() and ddy() exist in the current Cg profile, and
+    //                  the GPU driver uses fine/high-quality derivatives.
+    //              2.) quad_vector places this fragment in its 2x2 pixel quad
+    //                  following get_quad_vector()'s conventions.
+    //  Returns:    Via the out parameters, the value curr takes at the
+    //              x-adjacent, y-adjacent and diagonal fragments of the quad.
+    float2 quad_pos = quad_vector.zw;
+    adjx = curr - ddx(curr) * quad_pos.x;
+    adjy = curr - ddy(curr) * quad_pos.y;
+    diag = adjx - ddy(adjx) * quad_pos.y;
+}
+
+void quad_gather(float4 quad_vector, float3 curr,
+    out float3 adjx, out float3 adjy, out float3 diag)
+{
+    //  Float3 overload: neighbors of curr along x, along y, and diagonally.
+    adjx = curr - quad_vector.z * ddx(curr);
+    adjy = curr - quad_vector.w * ddy(curr);
+    diag = adjx - quad_vector.w * ddy(adjx);
+}
+
+void quad_gather(float4 quad_vector, float2 curr,
+    out float2 adjx, out float2 adjy, out float2 diag)
+{
+    //  Float2 overload: neighbors of curr along x, along y, and diagonally.
+    adjx = curr - quad_vector.z * ddx(curr);
+    adjy = curr - quad_vector.w * ddy(curr);
+    diag = adjx - quad_vector.w * ddy(adjx);
+}
+
+float4 quad_gather(float4 quad_vector, float curr)
+{
+    //  Float overload.  Packs the whole quad into a single float4:
+    //      return.x == current         return.y == adjacent x
+    //      return.z == adjacent y      return.w == diagonal
+    //  The (current, adjacent x) pair is built first; one ddy() on that pair
+    //  then yields (adjacent y, diagonal) together.
+    float adj_x = curr - ddx(curr) * quad_vector.z;
+    float2 this_row = float2(curr, adj_x);
+    float2 other_row = this_row - ddy(this_row) * quad_vector.w;
+    return float4(this_row, other_row);
+}
+
+float4 quad_gather_sum(float4 quad_vector, float4 curr)
+{
+    //  Requires:   The same conditions as quad_gather().
+    //  Returns:    curr summed over all four fragments of the 2x2 quad.
+    float4 across_x, across_y, across_xy;
+    quad_gather(quad_vector, curr, across_x, across_y, across_xy);
+    return ((curr + across_x) + across_y) + across_xy;
+}
+
+float3 quad_gather_sum(float4 quad_vector, float3 curr)
+{
+    //  Float3 overload: curr summed over the four fragments of the quad.
+    float3 across_x, across_y, across_xy;
+    quad_gather(quad_vector, curr, across_x, across_y, across_xy);
+    return ((curr + across_x) + across_y) + across_xy;
+}
+
+float2 quad_gather_sum(float4 quad_vector, float2 curr)
+{
+    //  Float2 overload: curr summed over the four fragments of the quad.
+    float2 across_x, across_y, across_xy;
+    quad_gather(quad_vector, curr, across_x, across_y, across_xy);
+    return ((curr + across_x) + across_y) + across_xy;
+}
+
+float quad_gather_sum(float4 quad_vector, float curr)
+{
+    //  Float overload: add up the four values packed by quad_gather().
+    float4 quad = quad_gather(quad_vector, curr);
+    return ((quad.x + quad.y) + quad.z) + quad.w;
+}
+
+bool fine_derivatives_working(float4 quad_vector, float4 curr)
+{
+    //  Requires:   1.) ddx() and ddy() exist in the current Cg profile.
+    //              2.) quad_vector places this fragment in its 2x2 pixel quad
+    //                  per get_quad_vector()'s conventions.
+    //              3.) curr is a test vector whose derivatives are not constant
+    //                  (it should vary nonlinearly from fragment to fragment).
+    //  Returns:    true when fine/hybrid/high-quality derivatives are in use;
+    //              false when they are coarse or the test is inconclusive.
+    //  Usage:      Sanity check that quad-pixel communication actually works.
+    //  Method:     Fine derivatives are proven as soon as, for any component,
+    //                  (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
+    //  TODO: Check for floating point exact comparison issues!
+    float4 dx_here = ddx(curr);
+    float4 dy_here = ddy(curr);
+    //  Derivatives taken again on the reconstructed x/y neighbor values:
+    float4 dy_of_adjx = ddy(curr - dx_here * quad_vector.z);
+    float4 dx_of_adjy = ddx(curr - dy_here * quad_vector.w);
+    bool4 dy_differs = bool4(dy_here.x != dy_of_adjx.x, dy_here.y != dy_of_adjx.y,
+                             dy_here.z != dy_of_adjx.z, dy_here.w != dy_of_adjx.w);
+    bool4 dx_differs = bool4(dx_here.x != dx_of_adjy.x, dx_here.y != dx_of_adjy.y,
+                             dx_here.z != dx_of_adjy.z, dx_here.w != dx_of_adjy.w);
+    return any(dy_differs) || any(dx_differs);
+}
+
+bool fine_derivatives_working_fast(float4 quad_vector, float curr)
+{
+    //  Requires:   The same inputs as fine_derivatives_working().
+    //  Returns:    The same verdict as fine_derivatives_working().
+    //  Usage:      Cheaper than fine_derivatives_working(), but false
+    //              negatives are more likely, which limits its value for
+    //              offline testing/debugging.  As of May 2014 it also cannot
+    //              drive dynamic runtime branching, because derivatives (and
+    //              quad-pixel communication) are disallowed inside branches.
+    //              Future GPU's might permit them in dynamic branches if you
+    //              promise that the condition evaluates identically for every
+    //              fragment of the quad (and/or if the driver enforces that by
+    //              letting a single fragment decide).  Should that happen,
+    //              this version may become the more economical choice.
+    //  Method:     Only the ddy(curr) != ddy(adjx) half of the test, on a float.
+    float dy_here = ddy(curr);
+    float x_neighbor = curr - ddx(curr) * quad_vector.z;
+    float dy_of_neighbor = ddy(x_neighbor);
+    return dy_here != dy_of_neighbor;
+}
+
+#endif  //  QUAD_PIXEL_COMMUNICATION_H
+
+////////////////////////  END QUAD-PIXEL-COMMUNICATION  ///////////////////////
+
+//#include "special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the usual argument of erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              Max absolute error is 2.5*10**-5, with solid numerical
+    //              robustness and efficiency.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    //  Method:     Fit evaluated on abs(x); sign(x) then restores the sign.
+    const float4 t = float4(1.0)/(float4(1.0) + 0.47047*abs(x));
+    const float4 cubic = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float4 gaussian = exp(-(x*x));
+    const float4 magnitude = float4(1.0) - cubic*gaussian;
+    return magnitude * sign(x);
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 version: the polynomial-times-Gaussian fit is evaluated on
+    //  abs(x) and multiplied by sign(x) at the end.
+    const float3 t = float3(1.0)/(float3(1.0) + 0.47047*abs(x));
+    const float3 cubic = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float3 gaussian = exp(-(x*x));
+    const float3 magnitude = float3(1.0) - cubic*gaussian;
+    return magnitude * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 version: the polynomial-times-Gaussian fit is evaluated on
+    //  abs(x) and multiplied by sign(x) at the end.
+    const float2 t = float2(1.0)/(float2(1.0) + 0.47047*abs(x));
+    const float2 cubic = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float2 gaussian = exp(-(x*x));
+    const float2 magnitude = float2(1.0) - cubic*gaussian;
+    return magnitude * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Float version: fit evaluated on abs(x), sign restored by sign(x).
+    const float t = 1.0/(1.0 + 0.47047*abs(x));
+    const float cubic = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float gaussian = exp(-(x*x));
+    const float magnitude = 1.0 - cubic*gaussian;
+    return magnitude * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the usual argument of erf().
+    //  Returns:    erf() approximated by a scaled hyperbolic tangent.  The
+    //              error is visible, but it is very fast and perceptually
+    //              close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Depends on the driver implementing tanh() correctly; the
+    //              original author's nVidia 8800GTS returned garbage output.
+    return tanh(x * 1.202760580);
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 version of the tanh-based erf() approximation.
+    return tanh(x * 1.202760580);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 version of the tanh-based erf() approximation.
+    return tanh(x * 1.202760580);
+}
+
+float erft(const float x)
+{
+    //  Float version of the tanh-based erf() approximation.
+    return tanh(x * 1.202760580);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the usual argument of erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 version: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 version: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Float version: erft(x) if ERF_FAST_APPROXIMATION is defined, else erf6(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the usual gamma function argument, expected to
+    //                  lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s, precomputed by the caller so that it
+    //                  can be reused elsewhere.
+    //  Returns:    Approximate gamma function (real-numbered factorial)
+    //              from a two-coefficient Lanczos approximation, with the
+    //              coefficients derived by Paul Godfrey's method:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              For s in [0, 36], g ~= 1.12906830989 is about optimal:
+    //              max relative error 0.000463 over 2**16 equally spaced
+    //              evals.  Three coeffs (0.0000346 error) would not hurt
+    //              latency, but two leave more room for parallelism with
+    //              outside instructions.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler = float4(2.71828182845904523536028747135266249775724709);
+    const float4 s_plus_half = s + float4(0.5);
+    const float4 series = coeff0 + coeff1/(s + float4(1.0));
+    const float4 base = (s_plus_half + lanczos_g)/euler;  //  (s + g + 0.5)/e
+    //  base**s_plus_half * series is gamma(s + 1); scale by s_inv for gamma(s).
+    //  This has less error for small s's than (s -= 1.0) at the beginning.
+    const float4 gamma_s_plus_1 = pow(base, s_plus_half) * series;
+    return gamma_s_plus_1 * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 overload of the two-coefficient Lanczos gamma approximation:
+	static const float3 lanczos_g = float3(1.12906830989);
+	static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+	static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+	static const float3 euler_e = float3(2.71828182845904523536028747135266249775724709);
+	const float3 s_plus_half = s + float3(0.5);
+	const float3 series = coeff0 + coeff1/(s + float3(1.0));
+	const float3 pow_base = (s_plus_half + lanczos_g)/euler_e;
+	return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 overload of the two-coefficient Lanczos gamma approximation:
+	static const float2 lanczos_g = float2(1.12906830989);
+	static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+	static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+	static const float2 euler_e = float2(2.71828182845904523536028747135266249775724709);
+	const float2 s_plus_half = s + float2(0.5);
+	const float2 series = coeff0 + coeff1/(s + float2(1.0));
+	const float2 pow_base = (s_plus_half + lanczos_g)/euler_e;
+	return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload of the two-coefficient Lanczos gamma approximation:
+	static const float lanczos_g = 1.12906830989;
+	static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+	static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+	static const float euler_e = 2.71828182845904523536028747135266249775724709;
+	const float s_plus_half = s + 0.5;
+	const float series = coeff0 + coeff1/(s + 1.0);
+	const float pow_base = (s_plus_half + lanczos_g)/euler_e;
+	return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the gamma function argument, expected in [0, 36].
+    //  Returns:    The approximate gamma function of s, with a maximum
+    //              relative error of 0.000463.  See gamma_impl for details.
+	const float4 s_inv = float4(1.0)/s;
+	return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+	const float3 s_inv = float3(1.0)/s;  //  Float3 overload; gamma_impl() takes 1/s
+	return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+	const float2 s_inv = float2(1.0)/s;  //  Float2 overload; gamma_impl() takes 1/s
+	return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+	const float s_inv = 1.0/s;  //  Scalar overload; gamma_impl() takes 1/s
+	return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed so callers can reuse it)
+    //  Returns:    The lower incomplete gamma function for small s and small
+    //              z, from the first 4 terms of its series representation.
+    //  Written as a loop, the truncated series would be:
+	//      last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
+	//      sum = last_sign * last_pow / ((s + 0.0) * last_factorial);
+	//      for(int k = 1; k < 4; ++k)
+	//      {
+	//          last_sign *= -1.0; last_pow *= z; last_factorial *= k;
+	//          sum += last_sign * last_pow / ((s + k) * last_factorial);
+	//      }
+	//  Unrolled here with constants folded, arranged for madds and parallelism:
+	const float4 z_pow_s = pow(z, s);
+	float4 series = s_inv;  //  Term 0: 1/s
+	//  Terms 1, 2, and 3 use denominators k! * (s + k):
+	const float4 z_squared = z*z;
+	const float4 div1 = s + float4(1.0);
+	const float4 div2 = 2.0*s + float4(4.0);
+	const float4 div3 = 6.0*s + float4(18.0);
+	//float4 div4 = 24.0*s + float4(96.0);
+	series -= z/div1;
+	series += z_squared/div2;
+	series -= z * z_squared/div3;
+	//series += z_squared * z_squared / div4;
+	//  Apply the z**s factor and return:
+	return z_pow_s * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 overload: z**s times a 4-term alternating series in z.
+	const float3 z_pow_s = pow(z, s);
+	float3 series = s_inv;
+	const float3 z_squared = z*z;
+	const float3 div1 = s + float3(1.0);
+	const float3 div2 = 2.0*s + float3(4.0);
+	const float3 div3 = 6.0*s + float3(18.0);
+	series -= z/div1;
+	series += z_squared/div2;
+	series -= z * z_squared/div3;
+	return z_pow_s * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 overload: z**s times a 4-term alternating series in z.
+	const float2 z_pow_s = pow(z, s);
+	float2 series = s_inv;
+	const float2 z_squared = z*z;
+	const float2 div1 = s + float2(1.0);
+	const float2 div2 = 2.0*s + float2(4.0);
+	const float2 div3 = 6.0*s + float2(18.0);
+	series -= z/div1;
+	series += z_squared/div2;
+	series -= z * z_squared/div3;
+	return z_pow_s * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload: z**s times a 4-term alternating series in z.
+	const float z_pow_s = pow(z, s);
+	float series = s_inv;
+	const float z_squared = z*z;
+	const float div1 = s + 1.0;
+	const float div2 = 2.0*s + 4.0;
+	const float div3 = 6.0*s + 18.0;
+	series -= z/div1;
+	series += z_squared/div2;
+	series -= z * z_squared/div3;
+	return z_pow_s * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    The upper incomplete gamma function, from Gauss's continued
+    //              fraction representation truncated to 4 terms.
+	//  The truncated denominator is built from the innermost level outward.
+    //  Written as a loop, it would be:
+	//      denom = float4('inf');
+	//      float4 one = float4(1.0);
+	//      for(int i = 4; i > 0; --i)
+	//      {
+	//          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+	//      }
+	//  Unrolled here with constants folded, for madds and parallelism:
+	const float4 prefactor = pow(z, s) * exp(-z);
+	float4 frac = float4(7.0) + z - s;
+	frac = float4(5.0) + z - s + (3.0*s - float4(9.0))/frac;
+	frac = float4(3.0) + z - s + (2.0*s - float4(4.0))/frac;
+	frac = float4(1.0) + z - s + (s - float4(1.0))/frac;
+	return prefactor / frac;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 overload: z**s * exp(-z) over a 4-level continued fraction.
+	const float3 prefactor = pow(z, s) * exp(-z);
+	float3 frac = float3(7.0) + z - s;
+	frac = float3(5.0) + z - s + (3.0*s - float3(9.0))/frac;
+	frac = float3(3.0) + z - s + (2.0*s - float3(4.0))/frac;
+	frac = float3(1.0) + z - s + (s - float3(1.0))/frac;
+	return prefactor / frac;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 overload: z**s * exp(-z) over a 4-level continued fraction.
+	const float2 prefactor = pow(z, s) * exp(-z);
+	float2 frac = float2(7.0) + z - s;
+	frac = float2(5.0) + z - s + (3.0*s - float2(9.0))/frac;
+	frac = float2(3.0) + z - s + (2.0*s - float2(4.0))/frac;
+	frac = float2(1.0) + z - s + (s - float2(1.0))/frac;
+	return prefactor / frac;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: z**s * exp(-z) over a 4-level continued fraction.
+	const float prefactor = pow(z, s) * exp(-z);
+	float frac = 7.0 + z - s;
+	frac = 5.0 + z - s + (3.0*s - 9.0)/frac;
+	frac = 3.0 + z - s + (2.0*s - 4.0)/frac;
+	frac = 1.0 + z - s + (s - 1.0)/frac;
+	return prefactor / frac;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed so callers can reuse it)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed likewise)
+    //  Returns:    An approximation of the normalized lower incomplete gamma
+    //              function for s < 0.5.  Restricting s this way leaves only
+    //              two branches (not four) to evaluate, selected by z.  Each
+    //              branch uses four terms, with a max relative error of
+    //              ~0.00182.  The branch threshold and specifics were adapted
+    //              for fewer terms from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+	//  Both branches are always computed: real branches test slower even when available.
+	static const float4 z_cutoff = float4(0.775075);
+	bool4 use_upper;
+	use_upper.x = z.x > z_cutoff.x;
+	use_upper.y = z.y > z_cutoff.y;
+	use_upper.z = z.z > z_cutoff.z;
+	use_upper.w = z.w > z_cutoff.w;
+	const float4 from_upper = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float4 from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	//  Blend with 0/1 masks so each component keeps its own branch's result:
+	bool4 use_lower = not(use_upper);
+	return from_upper * float4(use_upper) + from_lower * float4(use_lower);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  Float3 overload: compute both branches, then pick per component by z.
+	static const float3 z_cutoff = float3(0.775075);
+	bool3 use_upper;
+	use_upper.x = z.x > z_cutoff.x;
+	use_upper.y = z.y > z_cutoff.y;
+	use_upper.z = z.z > z_cutoff.z;
+	const float3 from_upper = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float3 from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool3 use_lower = not(use_upper);
+	return from_upper * float3(use_upper) + from_lower * float3(use_lower);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  Float2 overload: compute both branches, then pick per component by z.
+	static const float2 z_cutoff = float2(0.775075);
+	bool2 use_upper;
+	use_upper.x = z.x > z_cutoff.x;
+	use_upper.y = z.y > z_cutoff.y;
+	const float2 from_upper = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float2 from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool2 use_lower = not(use_upper);
+	return from_upper * float2(use_upper) + from_lower * float2(use_lower);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload: compute both branches, then pick one by z.
+	static const float z_cutoff = 0.775075;
+	const bool use_upper = z > z_cutoff;
+	const float from_upper = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	return from_upper * float(use_upper) + from_lower * float(!use_upper);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    An approximation of the normalized lower incomplete gamma
+    //              function for s < 0.5.  See normalized_ligamma_impl().
+	const float4 recip_s = float4(1.0)/s;
+	const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  Float3 overload: precompute 1/s and 1/gamma(s) for the implementation.
+	const float3 recip_s = float3(1.0)/s;
+	const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  Float2 overload: precompute 1/s and 1/gamma(s) for the implementation.
+	const float2 recip_s = float2(1.0)/s;
+	const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload: precompute 1/s and 1/gamma(s) for the implementation.
+	const float recip_s = 1.0/s;
+	const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 uv2_to_uv4(float2 tex_uv)
+{
+    //  Zero-pad a float2 uv offset so it is safe to add to float4 tex2Dlod coords:
+    return float4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Squared-length helper macro (for usage with static constants); vec is expanded twice:
+#define LENGTH_SQ(vec) (dot(vec, vec))
+
+inline float get_fast_gaussian_weight_sum_inv(const float sigma)
+{
+    //  The Gaussian integral gives the asymptotic weight of the center pixel.
+    //  The unnormalized center pixel weight is 1.0, so its normalized weight
+    //  equals the inverse of the weight sum.  For a large enough blur (9+),
+    //  the asymptotic weight sum is close to the real one and faster:
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //      erf(-x) == -erf(x), so this is 0.5 * (2.0 * erf(0.5/(sigma*sqrt(2.0)))).
+    //  Curve-fitting is faster still.  The fit is also closer than the
+    //  asymptotic results, because it was constructed from 64 blur sizes
+    //  from [3, 131) and 255 equally-spaced sigmas from (0, blurN_std_dev),
+    //  so the results for smaller sigmas are biased toward smaller blurs.
+    //  The max error is 0.0031793913.
+    //  Relative FPS: 134.3 with erf, 135.8 with curve-fitting.
+    //static const float temp = 0.5/sqrt(2.0);
+    //return erf(temp/sigma);
+    const float fit = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    return min(fit, 0.399334576340352/sigma);
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup from an 11-tap
+    //              blur.  It may be mipmapped depending on settings and dxdy.
+    //  Compute unnormalized Gaussian kernel weights for texel distances 0-5
+    //  (constant factors drop out when normalizing) and the normalizer.
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float w5 = exp(-25.0 * inv_two_var);
+    const float norm = 1.0 /
+        (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Accumulate the weighted samples from one end of the kernel to the
+    //  other, then normalize.  Blurs are currently optimized for dynamic weights.
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    blurred += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    blurred += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    blurred += w5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return blurred * norm;
+}
+
+float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup from a 9-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-4, plus normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float norm = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+    //  Accumulate the weighted samples across the kernel, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    blurred += w4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return blurred * norm;
+}
+
+float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup from a 7-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-3, plus normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float norm = 1.0 / (w0 + 2.0 * (w1 + w2 + w3));
+    //  Accumulate the weighted samples across the kernel, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    blurred += w3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return blurred * norm;
+}
+
+float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup from a 5-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-2, plus normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float norm = 1.0 / (w0 + 2.0 * (w1 + w2));
+    //  Accumulate the weighted samples across the kernel, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    blurred += w2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return blurred * norm;
+}
+
+float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup from a 3-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-1, plus normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float norm = 1.0 / (w0 + 2.0 * w1);
+    //  Accumulate the weighted samples across the kernel, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    return blurred * norm;
+}
+
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup built from 6
+    //              linear taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-5, plus normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float w5 = exp(-25.0 * inv_two_var);
+    const float norm = 1.0 /
+        (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
+    //  Merge texel pairs: each pair gets a combined weight and a sample offset
+    //  fraction.  The center texel (w0) feeds two pairs, so it counts by half.
+    const float w01 = w0 * 0.5 + w1;
+    const float w23 = w2 + w3;
+    const float w45 = w4 + w5;
+    const float frac01 = w1/w01;
+    const float frac23 = w3/w23;
+    const float frac45 = w5/w45;
+    //  Accumulate the weighted linear taps across the kernel, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w45 * tex2D_linearize(tex, tex_uv - (4.0 + frac45) * dxdy).rgb;
+    blurred += w23 * tex2D_linearize(tex, tex_uv - (2.0 + frac23) * dxdy).rgb;
+    blurred += w01 * tex2D_linearize(tex, tex_uv - frac01 * dxdy).rgb;
+    blurred += w01 * tex2D_linearize(tex, tex_uv + frac01 * dxdy).rgb;
+    blurred += w23 * tex2D_linearize(tex, tex_uv + (2.0 + frac23) * dxdy).rgb;
+    blurred += w45 * tex2D_linearize(tex, tex_uv + (4.0 + frac45) * dxdy).rgb;
+    return blurred * norm;
+}
+
+float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup built from 1
+    //              nearest neighbor and 4 linear taps.  It may be mipmapped
+    //              depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-4, plus normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float w4 = exp(-16.0 * inv_two_var);
+    const float norm = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+    //  Merge texel pairs: combined weight and sample offset fraction per pair.
+    const float w12 = w1 + w2;
+    const float w34 = w3 + w4;
+    const float frac12 = w2/w12;
+    const float frac34 = w4/w34;
+    //  Accumulate the weighted taps across the kernel, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w34 * tex2D_linearize(tex, tex_uv - (3.0 + frac34) * dxdy).rgb;
+    blurred += w12 * tex2D_linearize(tex, tex_uv - (1.0 + frac12) * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w12 * tex2D_linearize(tex, tex_uv + (1.0 + frac12) * dxdy).rgb;
+    blurred += w34 * tex2D_linearize(tex, tex_uv + (3.0 + frac34) * dxdy).rgb;
+    return blurred * norm;
+}
+
+float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup built from 4 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-3, plus normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float w3 = exp(-9.0 * inv_two_var);
+    const float norm = 1.0 / (w0 + 2.0 * (w1 + w2 + w3));
+    //  Merge texel pairs: each pair gets a combined weight and a sample offset
+    //  fraction.  The center texel (w0) feeds two pairs, so it counts by half.
+    const float w01 = w0 * 0.5 + w1;
+    const float w23 = w2 + w3;
+    const float frac01 = w1/w01;
+    const float frac23 = w3/w23;
+    //  Accumulate the weighted linear taps across the kernel, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w23 * tex2D_linearize(tex, tex_uv - (2.0 + frac23) * dxdy).rgb;
+    blurred += w01 * tex2D_linearize(tex, tex_uv - frac01 * dxdy).rgb;
+    blurred += w01 * tex2D_linearize(tex, tex_uv + frac01 * dxdy).rgb;
+    blurred += w23 * tex2D_linearize(tex, tex_uv + (2.0 + frac23) * dxdy).rgb;
+    return blurred * norm;
+}
+
+float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup built from 1
+    //              nearest neighbor and 2 linear taps.  It may be mipmapped
+    //              depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for texel distances 0-2, plus normalizer:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * inv_two_var);
+    const float w2 = exp(-4.0 * inv_two_var);
+    const float norm = 1.0 / (w0 + 2.0 * (w1 + w2));
+    //  Merge the texel pair: combined weight and sample offset fraction.
+    const float w12 = w1 + w2;
+    const float frac12 = w2/w12;
+    //  Accumulate the weighted taps across the kernel, then normalize:
+    float3 blurred = float3(0.0,0.0,0.0);
+    blurred += w12 * tex2D_linearize(tex, tex_uv - (1.0 + frac12) * dxdy).rgb;
+    blurred += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    blurred += w12 * tex2D_linearize(tex, tex_uv + (1.0 + frac12) * dxdy).rgb;
+    return blurred * norm;
+}
+
+float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Get the unnormalized texel weights: w_k = exp(-k*k/(2.0*sigma*sigma)).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    //  No weight_sum_inv is needed: w0 + 2.0 * w1 == 2.0 * w01, so it cancels.
+    //  Calculate the combined weight and linear sample ratio for the texel pair.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w01 = w0 * 0.5 + w1;
+    const float w01_ratio = w1/w01;
+    //  Both taps share the normalized weight w01/(2.0 * w01), so just average:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+
+////////////////////////////  HUGE SEPARABLE BLURS  ////////////////////////////
+
+//  Huge separable blurs come only in "fast" versions.
+float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Get the unnormalized texel weights: w_k = exp(-k*k/(2.0*sigma*sigma)).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    const float w16 = exp(-256.0 * denom_inv);
+    const float w17 = exp(-289.0 * denom_inv);
+    const float w18 = exp(-324.0 * denom_inv);
+    const float w19 = exp(-361.0 * denom_inv);
+    const float w20 = exp(-400.0 * denom_inv);
+    const float w21 = exp(-441.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 +
+    //        w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Calculate combined weights and linear sample ratios between texel pairs.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w16_17 = w16 + w17;
+    const float w18_19 = w18 + w19;
+    const float w20_21 = w20 + w21;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    const float w16_17_ratio = w17/w16_17;
+    const float w18_19_ratio = w19/w18_19;
+    const float w20_21_ratio = w21/w20_21;
+    //  Sum the weighted samples, then scale by the curve-fitted weight sum inverse:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 31x Gaussian blurred texture lookup around tex_uv using
+    //              16 linear taps.  It may be mipmapped depending on settings and dxdy.
+    //  Texel k steps from the center (offset k * dxdy) weighs exp(-k*k/(2*sigma^2)):
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    //  Explicit normalization, kept for reference only (the helper call below is
+    //  what actually runs):  1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 +
+    //      w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15))
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Merge texel pairs into single linear taps (weight = sum, ratio = offset
+    //  toward the farther texel).  Both center taps read w0, so halve its weight.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    //  Sum the weighted taps from the -dxdy side to the +dxdy side, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 25x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor tap at tex_uv and 12 linear taps.  It may be
+    //              mipmapped depending on settings and dxdy.
+    //  Texel k steps from the center (offset k * dxdy) weighs exp(-k*k/(2*sigma^2)):
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    //  Explicit form kept for reference (helper below is what runs): 1.0 / (w0 +
+    //      2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12))
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Merge texel pairs into linear taps: weight = sum, ratio = offset toward the farther texel.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w9_10 = w9 + w10;
+    const float w11_12 = w11 + w12;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    const float w9_10_ratio = w10/w9_10;
+    const float w11_12_ratio = w12/w11_12;
+    //  Sum the weighted taps from the -dxdy side to the +dxdy side, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 17x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor tap at tex_uv and 8 linear taps.  It may be
+    //              mipmapped depending on settings and dxdy.
+    //  Texel k steps from the center (offset k * dxdy) weighs exp(-k*k/(2*sigma^2)):
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    //  Explicit form kept for reference (helper below is what runs):
+    //      1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8))
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Merge texel pairs into linear taps: weight = sum, ratio = offset toward the farther texel.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    //  Sum the weighted taps from the -dxdy side to the +dxdy side, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input, from 9 taps spaced dxdy.x / dxdy.y apart.
+    //  Description:
+    //  This is the only arbitrarily resizable one-pass blur; tex2Dblur5x5resize
+    //  would perform like tex2Dblur9x9, MUCH slower than tex2Dblur5resize.
+    const float denom_inv = 0.5/(sigma*sigma);
+    //  Load all 3x3 samples, numbered 0-8 row by row (-dy row first; sample4 is
+    //  the center).  Quad-pixel sharing won't help: this should perform like
+    //  tex2Dblur5x5, but a shared 4x4 field more like tex2Dblur8x8shared (worse).
+    const float2 sample4_uv = tex_uv;
+    const float2 dx = float2(dxdy.x, 0.0);
+    const float2 dy = float2(0.0, dxdy.y);
+    const float2 sample1_uv = sample4_uv - dy;
+    const float2 sample7_uv = sample4_uv + dy;
+    const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb;
+    const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
+    const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb;
+    const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb;
+    const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
+    const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb;
+    const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb;
+    const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
+    const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb;
+    //  Gaussian weights: center, edge samples (offset (1,0)), corner samples (offset (1,1)):
+    const float w4 = 1.0;
+    const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8));
+    //  Weight and sum the samples, then scale so the weights total 1.0:
+    const float3 sum = w4 * sample4 +
+        w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) +
+        w0_2_6_8 * (sample0 + sample2 + sample6 + sample8);
+    return sum * weight_sum_inv;
+}
+
+
+////////////////////////////  FASTER ONE-PASS BLURS  ///////////////////////////
+
+float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  One-pass 9x9 Gaussian blur around tex_uv; dxdy is the uv step per texel.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.  Adjust the
+    //  bilinear sample location to reflect the true Gaussian weights for each
+    //  underlying texel.  The following diagram illustrates the relative
+    //  locations of bilinear samples.  Each sample with the same number has the
+    //  same weight (notice the symmetry).  The letters a, b, c, d distinguish
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float w4off = exp(-16.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    const float texel3to4ratio = w4off/(w3off + w4off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0);
+    const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+    const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio);
+    const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio);
+    const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and" (e.g. w3d2_3d3 weights both texels 3d2 and 3d3).
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2R1 = w3off;
+    const float w2R2 = w4off;
+    const float w3d1 =     exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w3d4 =     exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv);
+    const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv);
+    const float w6d1 =     exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv);
+    const float w6d4 =     exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv);
+    //  Add each sample's texel weights to get its weight (5x mirrors 4x, so w5 = w4):
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2R1 + w2R2;
+    const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    const float w5 = w4;
+    const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear), mirroring quadrant d:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear:
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by weight_sum_inv so weights total 1.0.
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
+    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
+    const float texel2to3ratio = w3off/(w2off + w3off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including axis-aligned:
+    const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+    const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and" (e.g. w2d3_3d2 weights both texels 2d3 and 3d2).
+    const float w1abcd = 1.0;
+    const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv);
+    const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv);
+    const float w1d4 =       exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d3_3d2 =   exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4_3d4 =   exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d1 =       exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d2_4d3 =   exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4 =       exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights.
+    //  Split weights for shared texels between samples sharing them:
+    const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;
+    const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;
+    const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples by mirroring the quadrant-d offsets into a, b, and c:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by weight_sum_inv so weights total 1.0.
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
+    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
+    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
+    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  First see the description for tex2Dblur9x9().  This blur uses the same
+    //  concept and sample/texel locations except on a smaller scale.  Samples:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels:
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and" (w2d2_3 weights both texels 2d2 and 2d3).
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2d1 =   exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4 =   exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights:
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4;
+    //  Get the weight sum inverse (1 center + 4 axis samples + 4 corner samples):
+    const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 9 samples (1 nearest, 4 linear, 4 bilinear), mirroring quadrant d:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum the weighted samples, then scale by weight_sum_inv so weights total 1.0.
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2a + sample2b + sample2c + sample2d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed of
+    //              2x2 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      0a 0b
+    //      0c 0d
+    //  The texel layout is as follows.  Note that samples 0a/0b and 0c/0d share
+    //  a vertical column of texels, and samples 0a/0c and 0b/0d share a
+    //  horizontal row of texels (all samples share the center texel):
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
+    //  The bottom-right sample sits texel0to1ratio texels from the center on
+    //  both axes; the other three samples mirror this offset:
+    const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 4 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb;
+    const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb;
+    const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb;
+    const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  All four samples sit at mirrored offsets and weigh the same, so average them:
+    return 0.25 * (sample0a + sample0b + sample0c + sample0d);
+}
+
+
+//////////////////  LINEAR ONE-PASS BLURS WITH SHARED SAMPLES  /////////////////
+
+float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   1.) Same as tex2Dblur9()
+    //              2.) ddx() and ddy() are present in the current Cg profile.
+    //              3.) The GPU driver is using fine/high-quality derivatives.
+    //              4.) quad_vector *correctly* describes the current fragment's
+    //                  location in its pixel quad, by the conventions noted in
+    //                  get_quad_vector[_naive].
+    //              5.) tex_uv.w = log2(video_size/output_size).y
+    //              6.) tex2Dlod() is present in the current Cg profile.
+    //  Optional:   Tune artifacts vs. excessive blurriness with the global
+    //              float error_blurring.
+    //  Returns:    A blurred texture lookup using a "virtual" 12x12 Gaussian
+    //              blur (a 6x6 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  Perform a 1-pass blur with shared texture lookups across a pixel quad.
+    //  We'll get neighboring samples with high-quality ddx/ddy derivatives, as
+    //  in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
+    //  Message Passing" by Eric Penner.
+    //
+    //  Each fragment of our "virtual" 12x12 blur loads (6^2)/4 + 3 = 12
+    //  bilinear samples, where bilinear sampling positions are computed from
+    //  the relative Gaussian weights of the 4 surrounding texels.  The catch is
+    //  that the appropriate texel weights and sample coords differ for each
+    //  fragment, but we're reusing most of the same samples across a quad of
+    //  destination fragments.  (We do use unique coords for the four nearest
+    //  samples at each fragment.)  Mixing bilinear filtering and sample-sharing
+    //  therefore introduces some error into the weights, and this can get nasty
+    //  when the source image is small or high-frequency.  Computing bilinear
+    //  ratios based on weights at the sample field center results in sharpening
+    //  and ringing artifacts, but we can move samples closer to halfway between
+    //  texels to try blurring away the error (which can move features around by
+    //  a texel or so).  Tune this with the global float "error_blurring".
+    //
+    //  The pixel quad's sample field covers 12x12 texels, accessed through 6x6
+    //  bilinear (2x2 texel) taps.  Each fragment weights all 6x6 taps (see the
+    //  36 weights computed below), and each fragment is responsible for loading
+    //  a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps
+    //  to use unique bilinear coords for sample0* for each fragment.  This
+    //  diagram illustrates the relative locations of bilinear samples 0-8 for
+    //  each quadrant a, b, c, d (note samples will not be equally spaced):
+    //      8a 7a 6a 6b 7b 8b
+    //      5a 4a 3a 3b 4b 5b
+    //      2a 1a 0a 0b 1b 2b
+    //      2c 1c 0c 0d 1d 2d
+    //      5c 4c 3c 3d 4d 5d
+    //      8c 7c 6c 6d 7d 8d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2 texel block:
+    //      8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
+    //      8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
+    //      5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
+    //      5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
+    //      2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
+    //      2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
+    //      2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
+    //      2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
+    //      5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
+    //      5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
+    //      8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
+    //      8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
+    //  With this symmetric arrangement, we don't have to know which absolute
+    //  quadrant a sample lies in to assign kernel weights; it's enough to know
+    //  the sample number and the relative quadrant of the sample (relative to
+    //  the current quadrant):
+    //      {current, adjacent x, adjacent y, diagonal}
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
+    //  and [4, 5] away from the fragment, and reuse them independently for both
+    //  dimensions.  Use the sample field center as the estimated destination,
+    //  but nudge the result closer to halfway between texels to blur error.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  based on the sum of their 4 underlying texel weights.  Assume a same-
+    //  resolution blur, so each symmetrically named sample weight will compute
+    //  the same at every fragment in the pixel quad: We can therefore compute
+    //  texel weights based only on the bottom-right quadrant (fragment at 0d0).
+    //  To avoid too much boilerplate code, use a macro to get all 4 texel
+    //  weights for a bilinear sample based on the offset of its top-left texel:
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
+    const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
+    const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
+    const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
+    const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
+    const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
+    const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime as {curr, adjx, adjy, diag}:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
+    const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
+    const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
+    const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    float3 sample8adjx, sample8adjy, sample8diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
+    //  Sum weighted samples; the total is normalized by weight_sum_inv on return.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
+    sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
+    sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
+    sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 10x10 Gaussian
+    //              blur (a 5x5 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 25 of the 36 samples taken across the pixel quad (to cover a
+    //  5x5 sample area, or 10x10 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 11 omitted samples
+    //  are always the "same:"
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 25 of the 36 sample weights.  Skip the following weights:
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor) over all 25 weights:
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w4curr + w5curr + w6curr + w7curr + w8curr +
+        w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
+        w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
+        w0diag + w1diag + w3diag + w4diag);
+    //  Statically pack most weights for runtime (w8curr stays a scalar).  Note the mixed packing: w2and5 holds {curr, adjy} pairs and w6and7 holds {curr, adjx} pairs:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);
+    const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad in order of need (sample8 is only used from the current quadrant, so it is never gathered):
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    //  Sum weighted samples; the total is normalized by weight_sum_inv on return.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result.  First do the simple ones:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    //  Now do the mixed-sample ones:
+    sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
+    sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
+    sum += w8curr * sample8curr;
+    //  Normalize the sum (so the weights add to 1.0) and return:
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 8x8 Gaussian
+    //              blur (a 4x4 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This function
+    //  shares the same concept and a similar sample placement, except each
+    //  quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
+    //  respectively.  There could be a total of 16 samples, 4 of which each
+    //  fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
+    //  its own offset to reduce shared sample artifacts, bringing the sample
+    //  count for each fragment to 7.  Sample placement:
+    //      3a 2a 2b 3b
+    //      1a 0a 0b 1b
+    //      1c 0c 0d 1d
+    //      3c 2c 2d 3d
+    //  Texel placement:
+    //      3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
+    //      3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
+    //      1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
+    //      1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
+    //      1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
+    //      1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
+    //      3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
+    //      3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
+    
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 6x6 Gaussian
+    //              blur (a 3x3 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be some subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur8x8shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 9 of the 16 samples taken across the pixel quad (to cover a
+    //  3x3 sample area, or 6x6 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 7 omitted samples
+    //  are always the same ones:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 9 of the 16 sample weights.  Skip the following weights:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor) over the 9 used weights:
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
+    //  Statically pack the four sample0* weights for runtime (the rest stay scalar):
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch neighbors' samples from the 2x2 quad (only sample1adjy/sample2adjx get used):
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result for sample0*, and handle the rest
+    //  of the weights more directly/verbosely:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
+            w2adjx * sample2adjx + w3curr * sample3curr;
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  MAX OPTIMAL SIGMA BLUR WRAPPERS  //////////////////////
+
+//  The following blurs are static wrappers around the dynamic blurs above.
+//  HOPEFULLY, the compiler will be smart enough to do constant-folding.
+
+//  Resizable separable blurs (each forwards to its sigma-taking overload with blurN_std_dev):
+inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Fast separable blurs (each forwards to its sigma-taking overload with blurN_std_dev):
+inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Huge, "fast" separable blurs (each forwards with its blurN_std_dev, N = 43/31/25/17):
+inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+//  Resizable one-pass blurs (forwards to the sigma-taking overload with blur3_std_dev):
+inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" one-pass blurs (each forwards to its sigma-taking overload with blurN_std_dev):
+inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" shared-sample one-pass blurs (float4 tex_uv plus quad_vector; sigma = blurN_std_dev):
+inline float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev);
+}
+inline float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev);
+}
+inline float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev);
+}
+inline float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev);
+}
+
+
+#endif  //  BLUR_FUNCTIONS_H
+
+////////////////////////////  END BLUR-FUNCTIONS  ///////////////////////////
+
+void main() {
+    //  Sample the (auto-dimmed) masked scanlines at this fragment:
+    const float3 intensity_dim =
+        tex2D_linearize(MASKED_SCANLINEStexture, scanline_tex_uv).rgb;
+    //  Get the full intensity: undo auto-dimming, then apply mask compensation and contrast:
+    const float auto_dim_factor = levels_autodim_temp;
+    const float undim_factor = 1.0/auto_dim_factor;
+    const float mask_amplify = get_mask_amplify();
+    const float3 intensity = intensity_dim * undim_factor * mask_amplify *
+        levels_contrast;
+
+    //  Sample BLOOM_APPROX to estimate what a straight blur of masked scanlines
+    //  would look like, so we can estimate how much energy we'll receive from
+    //  blooming neighbors:
+    const float3 phosphor_blur_approx = levels_contrast * tex2D_linearize(
+        BLOOM_APPROXtexture, blur3x3_tex_uv).rgb;
+
+    //  Compute the blur weight for the center texel and the maximum energy we
+    //  expect to receive from neighbors:
+    const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime);
+    const float center_weight = get_center_weight(bloom_sigma);
+    const float3 max_area_contribution_approx =
+        max(float3(0.0, 0.0, 0.0), phosphor_blur_approx - center_weight * intensity);
+    //  Assume neighbors will blur 100% of their intensity (blur_ratio = 1.0),
+    //  because it actually gets better results (on top of being very simple),
+    //  but adjust all intensities for the user's desired underestimate factor:
+    const float3 area_contrib_underestimate =
+        bloom_underestimate_levels * max_area_contribution_approx;
+    const float3 intensity_underestimate =
+        bloom_underestimate_levels * intensity;
+    //  Calculate the blur_ratio, the ratio of intensity we want to blur:
+    #ifdef BRIGHTPASS_AREA_BASED
+        //  This area-based version changes blur_ratio more smoothly and blurs
+        //  more, clipping less but offering less phosphor differentiation:
+        const float3 phosphor_blur_underestimate = bloom_underestimate_levels *
+            phosphor_blur_approx;
+        const float3 soft_intensity = max(intensity_underestimate,
+            phosphor_blur_underestimate * mask_amplify);
+        const float3 blur_ratio_temp =
+            ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) /
+            soft_intensity - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0);
+    #else
+        const float3 blur_ratio_temp =
+            ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) /
+            intensity_underestimate - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0);
+    #endif
+    const float3 blur_ratio = clamp(blur_ratio_temp, 0.0, 1.0);
+    //  Calculate the brightpass based on the auto-dimmed, unamplified, masked
+    //  scanlines, encode if necessary, and write it to FragColor (alpha = 1.0):
+    const float3 brightpass = intensity_dim *
+        lerp(blur_ratio, float3(1.0, 1.0, 1.0), bloom_excess);
+    FragColor = encode_output(float4(brightpass, 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/brightpass.vs b/shaders/CRT-Royale.shader/brightpass.vs
new file mode 100644
index 00000000..2d02d72a
--- /dev/null
+++ b/shaders/CRT-Royale.shader/brightpass.vs
@@ -0,0 +1,6551 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+in vec4 position;
+in vec2 texCoord;
+
+out Vertex {
+   vec2 vTexCoord;
+   vec2 scanline_tex_uv;      //  same name the brightpass fragment main() uses to sample MASKED_SCANLINEStexture
+   vec2 blur3x3_tex_uv;       //  same name the brightpass fragment main() uses to sample BLOOM_APPROXtexture
+   float bloom_sigma_runtime; //  same name the brightpass fragment main() passes to get_final_bloom_sigma()
+};
+
+uniform vec4 targetSize;      //  .xy is aliased as output_size by the macros below
+uniform vec4 sourceSize[];    //  [0].xy and [3].xy back the texture_size/video_size aliases below
+
+// USER SETTINGS BLOCK: compile-time values for the tunable parameters (several repeat the *_static defaults below) //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK (edit the values above to retune this pass) //
+
+// Compatibility macros that map HLSL/Cg spellings onto GLSL so the ported code compiles unchanged
+#define mul(a,b) (b*a) // operands are swapped; NOTE(review): a and b are unparenthesized, so mul(M, u+v) expands to (u+v*M)
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y) // NOTE(review): GLSL mod() floors while HLSL fmod() truncates; results differ for negative operands
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x) // NOTE(review): swaps the arguments, yet HLSL atan2 and GLSL atan both take (y, x) -- confirm call sites
+#define rsqrt(c) inversesqrt(c)
+
+#define MASKED_SCANLINEStexture source[0] //  input 0: same index that texture_size/video_size use
+#define MASKED_SCANLINEStexture_size sourceSize[0].xy
+#define MASKED_SCANLINESvideo_size sourceSize[0].xy
+#define BLOOM_APPROXtexture source[3] //  input 3: its sizes come from sourceSize[3]
+#define BLOOM_APPROXtexture_size sourceSize[3].xy
+#define BLOOM_APPROXvideo_size sourceSize[3].xy
+
+#if defined(GL_ES) //  only GL_ES builds get an explicit precision qualifier
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130 //  this file declares #version 150, so this branch is the one taken
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; set to the 0.5 default (a previous note here claimed 1.0 -- confirm intent)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an earlier note called 0.5 unnecessarily dark and used 1.0, but the value set here is 0.5)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+    //  Pad by the offset's magnitude: the default red offset is negative.
+    static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x);
+    //  Most antialiasing filters have a base radius of 4.0 pixels:
+    static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (in effect here; a prior note found it too dark and preferred 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (in effect here; a prior note found it too dark and preferred 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;    //  NOTE(review): the static offset's x is -1.0/3.0, so this shrinks the border; confirm abs() isn't intended
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Build a unit-length (aspect, 1.0) vector for the curvature uv-mapping.
+    //  The requested ratio is capped at geom_max_aspect_ratio first, and the
+    //  final normalize() discards the absolute scale so that only the
+    //  proportions between x and y survive:
+    const float capped_aspect_ratio =
+        min(geom_aspect_ratio, geom_max_aspect_ratio);
+    return normalize(float2(capped_aspect_ratio, 1.0));
+}
+
+inline float2 get_geom_overscan_vector()   //  (x, y) overscan as one float2
+{
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector() //  (x, y) tilt angles as one float2
+{
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()   //  x offsets in (R, G, B) order
+{
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()   //  y offsets in (R, G, B) order
+{
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()   //  (x, y) offset of the R beam
+{
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()   //  (x, y) offset of the G beam
+{
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()   //  (x, y) offset of the B beam
+{
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    //  Return the red subpixel offset used for antialiasing.  The runtime
+    //  offsets are only honored when runtime antialiasing weights AND runtime
+    //  subpixel offsets are both #defined; every other combination returns
+    //  the static offset from the user settings:
+    #if defined(RUNTIME_ANTIALIAS_WEIGHTS) && defined(RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS)
+        //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+        return float2(aa_subpixel_r_offset_x_runtime,
+            aa_subpixel_r_offset_y_runtime);
+    #else
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Gain = 1.0/(average mask color), compensating for the mask's dimming:
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    return mask_type < 0.5 ? grille_gain :              //  0: aperture grille
+        (mask_type < 1.5 ? slot_gain : shadow_gain);    //  1: slot, 2: shadow
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Step 1: Read mask_sample_mode_desired when runtime mode/type selection
+    //  is #defined; otherwise read the static setting:
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        float requested_mode = mask_sample_mode_desired;
+    #else
+        float requested_mode = mask_sample_mode_static;
+    #endif
+    //  Step 2: Mode 0.0 requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined,
+    //  so without it the result is restricted to the range [1.0, 2.0]:
+    #ifndef PHOSPHOR_MASK_MANUALLY_RESIZE
+        requested_mode = clamp(requested_mode, 1.0, 2.0);
+    #endif
+    return requested_mode;
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants (skipped under OVERRIDE_STANDARD_GAMMA):
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas (ntsc_gamma is also the default pass gamma):
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default get_crt_gamma()
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT (the high value)
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  true: gamma functions output alpha = 1.0
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma accessors (CRT, GBA, LCD); OVERRIDE_DEVICE_GAMMA swaps them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define crt_gamma, gba_gamma, lcd_gamma:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Defaults: two of the standard gamma constants plus a literal GBA gamma.
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the three *_gamma constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else   //  Derive input/output gammas from the SIMULATE_* macros (if any):
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  Only reached if SIMULATE_CRT_ON_LCD is undefined:
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  Only reached if both *_ON_LCD macros are undefined:
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Lowest-precedence simulation macro:
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Not the first pass: input is used as-is (gamma 1.0, no decode)
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Not the last pass: output is written as-is (gamma 1.0, no encode)
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Non-first passes decode with the intermediate gamma
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Non-last passes encode with the intermediate gamma
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct (true exactly when this pass doesn't linearize its input itself):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Passes that don't gamma-encode return the color untouched:
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    //  Otherwise raise rgb to the power 1.0/(pass output gamma).  Alpha is
+    //  forced to 1.0 under assume_opaque_alpha and passed through if not.
+    float3 encoded_rgb = pow(color.rgb,
+        float3(1.0/get_pass_output_gamma()));
+    float encoded_alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        encoded_alpha = 1.0;
+    }
+    return float4(encoded_rgb, encoded_alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearizing passes raise rgb to the pass input gamma; every other
+    //  pass returns the sample unchanged.
+    if(linearize_input)
+    {
+        float3 linear_rgb = pow(color.rgb,
+            float3(get_pass_input_gamma()));
+        //  Alpha is never gamma-decoded: it is either forced to 1.0
+        //  (assume_opaque_alpha) or passed straight through.
+        float linear_alpha = assume_opaque_alpha ? 1.0 : color.a;
+        return float4(linear_rgb, linear_alpha);
+    }
+    else
+    {
+        return color;
+    }
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Always decode rgb with the caller-supplied per-channel gamma.  Alpha
+    //  is forced to 1.0 under assume_opaque_alpha, else passed through.
+    float4 decoded = float4(pow(color.rgb, gamma), color.a);
+    if(assume_opaque_alpha)
+    {
+        decoded.a = 1.0;
+    }
+    return decoded;
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D (coords are non-const, apparently per the FIXME above; the texel_off overloads hand texel_off to textureLod as its LOD argument -- NOTE(review): confirm no caller expects a texel offset):
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod (only tex_coords.xy is sampled; the LOD is 0.0 in the first overload and texel_off in the second, so tex_coords.zw go unused):
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
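+//  (NOT a comment terminator: the block comment opened above "tex3D:" continues through tex2D_linearize_gamma, tex2Dbias_linearize_gamma, and tex2Dfetch_linearize_gamma below; only tex2Dlod_linearize_gamma is compiled.)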
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod (decodes with the caller's gamma regardless of linearize_input; only tex_coords.xy is sampled, at LOD 0.0 in the first overload and LOD texel_off in the second):
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what's allowed:
+//  Dynamic loops not allowed: Use a flat static loop (USE_SINGLE_STATIC_LOOP).
+//  Dynamic loops accommodated: Coarsely branch around static loops (BREAK_LOOPS_INTO_PIECES).
+//  Dynamic loops assumed allowed: Use a flat dynamic loop (neither macro defined).
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  Larger resized tiles need fewer samples when downsizing.  The smallest
+//  tile we expect is currently just the smallest tile size we allow:
+static const float mask_min_allowed_tile_size =
+    ceil(mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size = mask_min_allowed_tile_size;
+//  The maximum minification factor bounds the sinc resize tap count:
+//  taps = 2 * lobes * (source LUT width / smallest expected tile size).
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float =
+    2.0 * mask_sinc_lobes * mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+static const float max_sinc_resize_samples_m4 =
+    ceil(max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   The following global constants must be defined:
+    //              1.) mask_sinc_lobes
+    //              2.) max_sinc_resize_samples_m4
+    //  Returns:    The minimum number of texture samples for a correct downsize
+    //              at magnification_scale, rounded up to a multiple of 4 and
+    //              capped at max_sinc_resize_samples_m4 (see below).
+    //  We're downsizing, so the filter is sized across 2*lobes output pixels
+    //  (not 2*lobes input texels), which sets the min input sample count.
+    const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float max_samples_m4 = max_sinc_resize_samples_m4;
+    #else   // ifdef BREAK_LOOPS_INTO_PIECES
+        //  Simulating loops with branches imposes a 128-sample limit.
+        const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    return min(min_samples_m4, max_samples_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, 
+    const float2 tex_size, const float dr, 
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) input_tiles_per_texture_r is the number of input tiles
+    //                  that can fit in the input texture in the direction we're
+    //                  resampling this pass.
+    //              3.) vertical indicates whether we're resampling vertically
+    //                  this pass (or horizontally).
+    //  Returns:    Pack and return the first sample's tile_uv coord in [0, 1]
+    //              and its texel distance from the destination pixel, in the
+    //              resized dimension only.
+    //  We'll start with the topmost or leftmost sample and work down or right,
+    //  so get the first sample location and distance.  Modify both dimensions
+    //  as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    //  (and incorrect) dimension at the end.
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);
+    const float2 first_texel_uv_wrap_2D = first_texel * dr;
+    const float2 first_texel_dist_2D = curr_texel - first_texel;
+    //  Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const float2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    //  Project wrapped coordinates to the [0, 1] range.  We'll do this with all
+    //  samples, but the first texel is special, since it might be negative.
+    const float2 coord_negative =
+        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));
+    const float2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
+    //  Pack the first texel's tile_uv coord and texel distance in 1D:
+    const float2 tile_u_and_dist =
+        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const float2 tile_v_and_dist =
+        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    //  One [slow] workaround is to sample mip level 0 explicitly (lod = 0.0):
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord in [0, 1].  tile_uv_r holds the wrapped tile_uv coord for
+    //  four new texel samples; tex_uv_r scales it by tile_size_uv_r.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the tex_uv size of an input tile (its
+    //                  fraction of the input texture), in the direction we're
+    //                  resampling this pass.
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
+    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    //  wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Requires:   1.) Requirements of get_resized_mask_tile_size() must be
+    //                  met, particularly regarding global constants.
+    //              The function parameters must be defined as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    //  Returns:    Return a float4 containing:
+    //                  xy: tex_uv coords for the start of the mask tile
+    //                  zw: tex_uv size of the mask tile from start to end
+    //              (xy == 0.0, zw == 1.0 unless we're sampling MASK_RESIZE), and
+    //              mask_tiles_per_screen (out): mask tiles fitting on the screen.
+    //  First get the final resized tile size.  The viewport size and mask
+    //  resize viewport scale must be correct, but don't solemnly swear they
+    //  were correct in both mask resize passes unless you know it's true.
+    //  (We can better ensure a correct tile aspect ratio if the parameters are
+    //  guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    //  sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(mask_sample_mode < 0.5)
+    {
+        //  Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        //  size and starts at a nonzero offset to allow for border texels:
+        const float2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
+        const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        //  mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        //  If we're tiling at the original size (1:1 pixel:texel), redefine a
+        //  "tile" to be the full texture containing many triads.  Otherwise,
+        //  we're hardware-resampling an LUT, and the texture truly contains a
+        //  single unresized phosphor mask tile anyway.
+        static const float2 mask_tile_uv_size = float2(1.0);
+        static const float2 mask_tile_start_uv = float2(0.0);
+        if(mask_sample_mode > 1.5)
+        {
+            //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else
+        {
+            //  Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Requires:   1.) tile_uv_wrap contains tile-relative uv coords, where the
+    //                  tile spans from [0, 1], such that (0.5, 0.5) is at the
+    //                  tile center.  The input coords can range from [0, inf],
+    //                  and their fractional parts map to a repeated tile.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
+    //                  for the start of the embedded tile in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw contains the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    tex_uv coords (for texture sampling) matching tile_uv_wrap.
+    //  NOTE(review): FIX_DISCONTINUITIES path's helper looks commented out above.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        //  Manually repeat the resized mask tile to fill the screen:
+        //  First get fractional tile_uv coords.  Using frac/fmod on coords
+        //  confuses anisotropic filtering; fix it as user options dictate.
+        //  derived-settings-and-constants.h disables incompatible options.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+        #else
+            float2 tile_uv = frac(tile_uv_wrap);
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            const float2 tile_uv_dx = ddx(tile_uv);
+            const float2 tile_uv_dy = ddy(tile_uv);
+            tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
+                tile_uv_dx, tile_uv_dy);
+        #endif
+        //  The tile is embedded in a padded FBO, and it may start at a
+        //  nonzero offset if border texels are used to avoid artifacts:
+        const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+            tile_uv * mask_tile_start_uv_and_size.zw;
+        return mask_tex_uv;
+    }
+    else
+    {
+        //  Sample from the input phosphor mask texture with hardware tiling.
+        //  If we're tiling at the original size (mode 2), the "tile" is the
+        //  whole texture, and it contains a large number of triads mapped with
+        //  a 1:1 pixel:texel ratio.  OTHERWISE, the texture contains a single
+        //  unresized tile.  tile_uv_wrap already has correct coords for both!
+        return tile_uv_wrap;
+    }
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default (try 1.0 if the image looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  beam_max_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: beam_max_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines: beam_max_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines: beam_max_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines: beam_max_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines: beam_max_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default.  NOTE(review): this comment used to say the value was raised to 1.0 (0.5 looked too dark), but the code is 0.5; confirm
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero by clamping |c| to >= 2^-16.  The sign of c is dropped by abs().  A macro works for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Requesting a tex2Dlod strategy also requests the matching tex2Dbias one as a backup for wider compatibility (whichever is unsupported or redundant is #undef'd below):
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out anisotropic compatibility options whose DRIVERS_ALLOW_* prerequisite is not #defined:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance (tex2Dlod,
+//  then tex2Dbias, then tile-flat-twice, then fix-discontinuities) and #undef the rest, all in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts (both macros
+//  below #defined), we should: It allows fewer samples.  Otherwise use the small LUT.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;   //  Tiles * triads per tile
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;    //  Per axis
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;     //  Pi, truncated to 12 decimals
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              The polynomial is evaluated on |x| and the sign of x is
+    //              restored at the end.  Max absolute error: 2.5*10**-5.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    static const float4 unit = float4(1.0);
+    const float4 t = unit/(unit + 0.47047*abs(x));
+    const float4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float4 magnitude = unit - poly*exp(-(x*x));
+    //  Restore the sign that abs() and x*x discarded:
+    return magnitude * sign(x);
+}
+
+float3 erf6(const float3 x)
+{
+    //  float3 overload of the Abramowitz/Stegun erf() approximation:
+    //  evaluate on |x|, then restore the sign of each component.
+    static const float3 unit = float3(1.0);
+    const float3 t = unit/(unit + 0.47047*abs(x));
+    const float3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float3 magnitude = unit - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  float2 overload of the Abramowitz/Stegun erf() approximation:
+    //  evaluate on |x|, then restore the sign of each component.
+    static const float2 unit = float2(1.0);
+    const float2 t = unit/(unit + 0.47047*abs(x));
+    const float2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float2 magnitude = unit - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload of the Abramowitz/Stegun erf() approximation:
+    //  evaluate on |x|, then restore the sign.
+    const float t = 1.0/(1.0 + 0.47047*abs(x));
+    const float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float magnitude = 1.0 - poly*exp(-(x*x));
+    return magnitude * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    tanh()-based approximation of erf(x): visibly less accurate
+    //              but very fast and perceptually close (at least on ATI).  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this where the drivers implement tanh() correctly:
+    //              the original author's nVidia 8800GTS returned garbage.
+    const float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    const float3 scaled_x = 1.202760580 * x;  //  float3 tanh()-based erf()
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    const float2 scaled_x = 1.202760580 * x;  //  float2 tanh()-based erf()
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    const float scaled_x = 1.202760580 * x;  //  Scalar tanh()-based erf()
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is #defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  float3 overload: erf6(x), or erft(x) under ERF_FAST_APPROXIMATION.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  float2 overload: erf6(x), or erft(x) under ERF_FAST_APPROXIMATION.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload: erf6(x), or erft(x) under ERF_FAST_APPROXIMATION.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller must precompute this value,
+    //                  which gives it the opportunity to reuse it.
+    //  Returns:    Approximate gamma function (real-numbered factorial)
+    //              output from a two-coefficient Lanczos approximation, with
+    //              coefficients calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  We could use three coeffs (0.0000346 error)
+    //              without hurting latency, but two coeffs allow more
+    //              parallelism with outside instructions.
+    //  Lanczos parameters (g and two series coefficients) and Euler's number:
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler_e = float4(2.71828182845904523536028747135266249775724709);
+    const float4 s_plus_half = s + float4(0.5);
+    const float4 series = coeff0 + coeff1/(s + float4(1.0));
+    const float4 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    //  gamma(s + 1) = pow_base**s_plus_half * series; scale by 1/s for gamma(s).
+    //  This has less error for small s's than (s -= 1.0) at the beginning.
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  float3 overload of the two-coefficient Lanczos gamma approximation:
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler_e = float3(2.71828182845904523536028747135266249775724709);
+    const float3 s_plus_half = s + float3(0.5);
+    const float3 series = coeff0 + coeff1/(s + float3(1.0));
+    const float3 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  float2 overload of the two-coefficient Lanczos gamma approximation:
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler_e = float2(2.71828182845904523536028747135266249775724709);
+    const float2 s_plus_half = s + float2(0.5);
+    const float2 series = coeff0 + coeff1/(s + float2(1.0));
+    const float2 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload of the two-coefficient Lanczos gamma approximation:
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler_e = 2.71828182845904523536028747135266249775724709;
+    const float s_plus_half = s + 0.5;
+    const float series = coeff0 + coeff1/(s + 1.0);
+    const float pow_base = (s_plus_half + lanczos_g)/euler_e;
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard gamma function parameter, in [0, 36].
+    //  Returns:    Approximate gamma function output with a maximum relative
+    //              error of 0.000463.  See gamma_impl() for details.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    const float3 s_inv = float3(1.0)/s;  //  float3 overload of gamma()
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    const float2 s_inv = float2(1.0)/s;  //  float2 overload of gamma()
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    const float s_inv = 1.0/s;  //  Scalar overload of gamma()
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    The first 4 terms of a series representation of the lower
+    //              incomplete gamma function for small s and small z.
+    //  The "rolled up" summation looks like:
+    //      sign = 1.0; z_pow = 1.0; factorial = 1.0;
+    //      sum = sign * z_pow / ((s + 0.0) * factorial);
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          sign *= -1.0; z_pow *= z; factorial *= k;
+    //          sum += sign * z_pow / ((s + k) * factorial);
+    //      }
+    //  Unrolled, constant-unfolded and arranged for madds and parallelism:
+    const float4 z_to_s = pow(z, s);
+    const float4 z_squared = z*z;
+    //  Denominators k! * (s + k) for summation terms 1, 2, and 3:
+    const float4 divisor1 = s + float4(1.0);
+    const float4 divisor2 = 2.0*s + float4(4.0);
+    const float4 divisor3 = 6.0*s + float4(18.0);
+    //float4 divisor4 = 24.0*s + float4(96.0);
+    float4 series = s_inv;              //  Term 0
+    series -= z/divisor1;               //  Term 1
+    series += z_squared/divisor2;       //  Term 2
+    series -= z * z_squared/divisor3;   //  Term 3
+    //series += z_squared * z_squared / divisor4;
+    //  Scale and return:
+    return z_to_s * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  float3 overload: 4-term series, scaled by z**s at the end.
+    const float3 z_to_s = pow(z, s);
+    const float3 z_squared = z*z;
+    const float3 divisor1 = s + float3(1.0);
+    const float3 divisor2 = 2.0*s + float3(4.0);
+    const float3 divisor3 = 6.0*s + float3(18.0);
+    float3 series = s_inv;
+    series -= z/divisor1;
+    series += z_squared/divisor2;
+    series -= z * z_squared/divisor3;
+    return z_to_s * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  float2 overload: 4-term series, scaled by z**s at the end.
+    const float2 z_to_s = pow(z, s);
+    const float2 z_squared = z*z;
+    const float2 divisor1 = s + float2(1.0);
+    const float2 divisor2 = 2.0*s + float2(4.0);
+    const float2 divisor3 = 6.0*s + float2(18.0);
+    float2 series = s_inv;
+    series -= z/divisor1;
+    series += z_squared/divisor2;
+    series -= z * z_squared/divisor3;
+    return z_to_s * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload: 4-term series, scaled by z**s at the end.
+    const float z_to_s = pow(z, s);
+    const float z_squared = z*z;
+    const float divisor1 = s + 1.0;
+    const float divisor2 = 2.0*s + 4.0;
+    const float divisor3 = 6.0*s + 18.0;
+    float series = s_inv;
+    series -= z/divisor1;
+    series += z_squared/divisor2;
+    series -= z * z_squared/divisor3;
+    return z_to_s * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function, truncated to 4 terms.
+    //  The truncated denominator is calculated "from the bottom up."
+    //  Rolled up, the continued fraction looks like this:
+    //      denom = float4('inf');
+    //      float4 one = float4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+    //      }
+    //  Unrolled and constant-unfolded for madds and parallelism:
+    float4 cont_frac = float4(7.0) + z - s;
+    cont_frac = float4(5.0) + z - s + (3.0*s - float4(9.0))/cont_frac;
+    cont_frac = float4(3.0) + z - s + (2.0*s - float4(4.0))/cont_frac;
+    cont_frac = float4(1.0) + z - s + (s - float4(1.0))/cont_frac;
+    const float4 z_to_s_exp = pow(z, s) * exp(-z);
+    return z_to_s_exp / cont_frac;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  float3 overload: 4-term continued fraction, built bottom-up.
+    float3 cont_frac = float3(7.0) + z - s;
+    cont_frac = float3(5.0) + z - s + (3.0*s - float3(9.0))/cont_frac;
+    cont_frac = float3(3.0) + z - s + (2.0*s - float3(4.0))/cont_frac;
+    cont_frac = float3(1.0) + z - s + (s - float3(1.0))/cont_frac;
+    const float3 z_to_s_exp = pow(z, s) * exp(-z);
+    return z_to_s_exp / cont_frac;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  float2 overload: 4-term continued fraction, built bottom-up.
+    float2 cont_frac = float2(7.0) + z - s;
+    cont_frac = float2(5.0) + z - s + (3.0*s - float2(9.0))/cont_frac;
+    cont_frac = float2(3.0) + z - s + (2.0*s - float2(4.0))/cont_frac;
+    cont_frac = float2(1.0) + z - s + (s - float2(1.0))/cont_frac;
+    const float2 z_to_s_exp = pow(z, s) * exp(-z);
+    return z_to_s_exp / cont_frac;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: 4-term continued fraction, built bottom-up.
+    float cont_frac = 7.0 + z - s;
+    cont_frac = 5.0 + z - s + (3.0*s - 9.0)/cont_frac;
+    cont_frac = 3.0 + z - s + (2.0*s - 4.0)/cont_frac;
+    cont_frac = 1.0 + z - s + (s - 1.0)/cont_frac;
+    const float z_to_s_exp = pow(z, s) * exp(-z);
+    return z_to_s_exp / cont_frac;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    The normalized lower incomplete gamma function, approximated
+    //              for s < 0.5.  Because only s < 0.5 matters, only two
+    //              branches (not four) are needed, chosen by z.  Each branch
+    //              uses four terms, with a max relative error of ~0.00182.  The
+    //              branch threshold and specifics were adapted for fewer terms
+    //              from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Evaluate both branches: Real branches test slower even when available.
+    static const float4 z_cutoff = float4(0.775075);
+    const float4 via_upper = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float4 via_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  Per-component selection masks (1.0 keeps a branch, 0.0 drops it):
+    bool4 use_upper;
+    use_upper.x = z.x > z_cutoff.x;
+    use_upper.y = z.y > z_cutoff.y;
+    use_upper.z = z.z > z_cutoff.z;
+    use_upper.w = z.w > z_cutoff.w;
+    bool4 use_lower = not(use_upper);
+    return via_upper * float4(use_upper) + via_lower * float4(use_lower);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  float3 overload: evaluate both branches, then blend by z > cutoff.
+    static const float3 z_cutoff = float3(0.775075);
+    const float3 via_upper = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float3 via_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool3 use_upper;
+    use_upper.x = z.x > z_cutoff.x;
+    use_upper.y = z.y > z_cutoff.y;
+    use_upper.z = z.z > z_cutoff.z;
+    bool3 use_lower = not(use_upper);
+    return via_upper * float3(use_upper) + via_lower * float3(use_lower);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  float2 overload: evaluate both branches, then blend by z > cutoff.
+    static const float2 z_cutoff = float2(0.775075);
+    const float2 via_upper = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float2 via_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool2 use_upper;
+    use_upper.x = z.x > z_cutoff.x;
+    use_upper.y = z.y > z_cutoff.y;
+    bool2 use_lower = not(use_upper);
+    return via_upper * float2(use_upper) + via_lower * float2(use_lower);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload: evaluate both branches, then blend by z > cutoff.
+    static const float z_cutoff = 0.775075;
+    const float via_upper = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float via_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const bool use_upper = z > z_cutoff;
+    return via_upper * float(use_upper) + via_lower * float(!use_upper);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    The normalized lower incomplete gamma function, approximated
+    //              for s < 0.5.  See normalized_ligamma_impl() for details.
+    const float4 recip_s = float4(1.0)/s;
+    const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  float3 overload: precompute 1/s and 1/gamma(s), then delegate.
+    const float3 recip_s = float3(1.0)/s;
+    const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  float2 overload: precompute 1/s and 1/gamma(s), then delegate.
+    const float2 recip_s = float2(1.0)/s;
+    const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload: precompute 1/s and 1/gamma(s), then delegate.
+    const float recip_s = 1.0/s;
+    const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+    return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them: defining OVERRIDE_STANDARD_GAMMA skips every declaration in this block, so the overriding file must then declare the constants that are still referenced (below, ntsc_gamma, crt_reference_gamma_high and lcd_office_gamma feed the default getters).
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default CRT gamma returned by get_crt_gamma()
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT (same value as crt_reference_gamma_high)
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default LCD gamma returned by get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs, but only if they're aware of it.  When this flag is true, encode_output(),
+//  decode_input() and decode_gamma_input() return alpha = 1.0 instead of color.a.  Define OVERRIDE_ALPHA_ASSUMPTIONS to skip this default and declare the flag yourself.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  Default: pass color.a through
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them.  With OVERRIDE_DEVICE_GAMMA defined the getters forward to user globals; otherwise they return the defaults in the #else branch:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define the appropriate constants (crt_gamma, gba_gamma and lcd_gamma):
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }   //  High CRT reference gamma
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }   //  Office (sRGB-like) LCD gamma
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The including file must already declare these three globals:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  When every pass is gamma-corrected, ntsc_gamma is always used between
+    //  passes, so middle passes never need to know what is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    //  First/last-pass gammas: the first SIMULATE_* macro defined below wins.
+    #if defined(SIMULATE_CRT_ON_LCD)
+        //  Decode with the CRT gamma, encode with the LCD gamma:
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        //  Decode with the GBA gamma, encode with the LCD gamma:
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        //  Decode with the LCD gamma, encode with the CRT gamma:
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        //  Decode with the GBA gamma, encode with the CRT gamma:
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+        //  Don't simulate anything; use ntsc_gamma on both ends:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_*
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  linearize_input and
+//  gamma_encode_output are static constants rather than functions: they are
+//  not derived from anything overridable, and constant flags let the compiler
+//  eliminate the unused branch of decode_input()/encode_output().
+//  Input side -- how this pass must decode the texture it reads:
+#ifdef FIRST_PASS
+    //  The first pass always decodes with the overall input gamma:
+    static const bool linearize_input = true;
+    inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+#elif defined(GAMMA_ENCODE_EVERY_FBO)
+    //  Later passes decode the manual encoding applied between passes:
+    static const bool linearize_input = true;
+    inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+#else
+    //  Later passes without per-FBO encoding read their input as-is:
+    static const bool linearize_input = false;
+    inline float get_pass_input_gamma()     {   return 1.0;                 }
+#endif
+//  Output side -- how this pass must encode the color it writes:
+#ifdef LAST_PASS
+    //  The last pass always encodes with the overall output gamma:
+    static const bool gamma_encode_output = true;
+    inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+#elif defined(GAMMA_ENCODE_EVERY_FBO)
+    //  Earlier passes apply the manual encoding used between passes:
+    static const bool gamma_encode_output = true;
+    inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+#else
+    //  Earlier passes without per-FBO encoding write their output as-is:
+    static const bool gamma_encode_output = false;
+    inline float get_pass_output_gamma()    {   return 1.0;                 }
+#endif
+
+//  Users might want to know if bilinear filtering will be gamma-correct.  This is simply the negation of linearize_input, i.e. true only when decode_input() returns samples unchanged:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Apply this pass's output gamma encoding to color.
+    //  Returns:    color untouched if gamma_encode_output is false.  Otherwise
+    //              rgb is raised to the power 1.0/get_pass_output_gamma(), and
+    //              alpha is 1.0 if assume_opaque_alpha is true, else color.a.
+    if(!gamma_encode_output)
+    {
+        //  Nothing to encode for this pass:
+        return color;
+    }
+    //  The rgb encode is the same under either alpha policy, so it is
+    //  written once here instead of once per branch:
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    //  Only the alpha channel depends on assume_opaque_alpha:
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, out_alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Apply this pass's input gamma decoding to a texture sample.
+    //  Returns:    color untouched if linearize_input is false.  Otherwise
+    //              rgb is raised to the power get_pass_input_gamma(), and
+    //              alpha is 1.0 if assume_opaque_alpha is true, else color.a.
+    if(!linearize_input)
+    {
+        //  Nothing to decode for this pass:
+        return color;
+    }
+    //  The rgb decode is the same under either alpha policy, so it is
+    //  written once here instead of once per branch:
+    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    //  Only the alpha channel depends on assume_opaque_alpha:
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, out_alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Decode color with a caller-supplied per-channel gamma.  Unlike
+    //  decode_input(), this ignores linearize_input and always applies the
+    //  power.
+    //  Returns:    rgb = pow(color.rgb, gamma); alpha is 1.0 if
+    //              assume_opaque_alpha is true, else color.a.
+    float3 linear_rgb = pow(color.rgb, gamma);
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, out_alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample tex at tex_coords (only .xy is used for float3 coords) and run the sample through decode_input().  The coords are deliberately not const-qualified here; see the TODO/EDIT note above about the blurs being offset.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }   //  NOTE(review): texel_off is handed to textureLod in its LOD slot (presumably the GLSL built-in), not applied as a texel offset -- confirm intended
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }   //  NOTE(review): same as above; texel_off acts as the LOD argument here
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: both overloads read only tex_coords.xy and run the sample through decode_input(); tex_coords.zw is ignored.  NOTE(review): Cg's tex2Dlod takes its LOD from tex_coords.w -- confirm that dropping it is intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }   //  LOD argument is fixed at 0.0
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }   //  NOTE(review): texel_off is forwarded as the LOD argument -- confirm intended
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
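+//  tex2D:  NOTE(review): as written these two overloads are dead code.  The line of slashes ending in an asterisk after the tex3Dproj wrappers does not terminate the tex3D block comment, so that comment stays open until the terminator just above the tex2Dlod section.  Confirm this is intentional before relying on tex2D_linearize_gamma().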
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: both overloads read only tex_coords.xy and decode the sample with the caller's per-channel gamma via decode_gamma_input(); tex_coords.zw is ignored.  NOTE(review): Cg's tex2Dlod takes its LOD from tex_coords.w -- confirm that dropping it is intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }   //  LOD argument is fixed at 0.0
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }   //  NOTE(review): texel_off is forwarded as the LOD argument -- confirm intended
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  holding the desired minimum and maximum beam standard
+    //                  deviations, for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; it is
+    //                  passed in so callers compute it once, even when
+    //                  beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
+    //              inner f(color) subfunction (see below) as:
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The standard deviation of the Gaussian beam for "color:"
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube balances
+    //  between variable width and a soft/realistic shape.   A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    float3 spot_shape;
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Power curve: f(color) = pow(color, beam_spot_power)
+        spot_shape = pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Spherical curve: f(color) = sqrt(1.0 - (color - 1.0)^2)
+        const float3 color_less_one = color - float3(1.0);
+        spot_shape = sqrt(float3(1.0) - color_less_one*color_less_one);
+    }
+    return float3(beam_min_sigma) + sigma_range * spot_shape;
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  holding the desired min/max generalized Gaussian
+    //                  beta parameters, for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; it is
+    //                  passed in so callers compute it once, even when
+    //                  beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian "shape" parameter beta for color.
+    //  Details/Discussion:
+    //  Beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    const float3 shape_curve = pow(color, float3(beam_shape_power));
+    return float3(beam_min_shape) + shape_range * shape_curve;
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  Details:
+    //  The CRT beam profile follows a roughly Gaussian distribution which is
+    //  wider for bright colors than dark ones.  The integral over the full
+    //  range of a Gaussian function is always 1.0, so we can vary the beam
+    //  with a standard deviation without affecting brightness.  'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  The definite integral over the pixel is the difference of that
+    //  expression at dist +/- pixel_height/2, evaluated with erf(); dividing
+    //  by pixel_height gives the average over the pixel.  Even if curved
+    //  coords were used in this pass, a flat scalar pixel height works almost
+    //  as well as one computed from a full pixel-to-scanline-space matrix.
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 erf_scale = 1.0/(beam_sigma*sqrt(2.0));
+    const float3 erf_upper = erf((dist + half_pixel)*erf_scale);
+    const float3 erf_lower = erf((dist - half_pixel)*erf_scale);
+    return color * 0.5*(erf_upper - erf_lower)/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 s = float3(1.0)/beta;  //  s = 1/beta, the first ligamma/gamma argument
+    const float3 ph_offset = float3(pixel_height * 0.5);    //  Half the pixel height
+    //  Pass beta to gamma_impl to avoid repeated divides.  Similarly pass
+    //  beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
+    const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta);
+    const float3 dist1 = dist + ph_offset;  //  Upper integration bound
+    const float3 dist0 = dist - ph_offset;  //  Lower integration bound
+    const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
+        s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
+    const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
+        s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
+    return color * 0.5*(integral_high - integral_low)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  Point-sampled counterpart of scanline_gaussian_integral_contrib(); see
+    //  that function for detailed comments!  Evaluates the Gaussian density
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //  at dist, or averages three samples if beam_antialias_level > 0.5.
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    //  Precompute reciprocals to avoid repeated divides:
+    const float3 beam_sigma_inv = float3(1.0)/beam_sigma;
+    const float3 exponent_scale = 0.5 * beam_sigma_inv * beam_sigma_inv;
+    const float3 peak_scale = beam_sigma_inv/sqrt(2.0*pi);
+    if(!(beam_antialias_level > 0.5))
+    {
+        //  No antialiasing: a single Gaussian sample at dist.
+        return color*exp(-(dist*dist)*exponent_scale)*peak_scale;
+    }
+    //  Sample 1/3 pixel away in each direction as well, then average the
+    //  three.  (abs() cannot change the result: the value is only squared.)
+    const float3 third_pixel = float3(pixel_height/3.0);
+    const float3 dist_plus = dist + third_pixel;
+    const float3 dist_minus = abs(dist - third_pixel);
+    const float3 third_of_peak = color/3.0 * peak_scale;
+    const float3 weight_mid = exp(-(dist*dist)*exponent_scale);
+    const float3 weight_plus = exp(-(dist_plus*dist_plus)*exponent_scale);
+    const float3 weight_minus = exp(-(dist_minus*dist_minus)*exponent_scale);
+    return third_of_peak * (weight_mid + weight_plus + weight_minus);
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Point-sample a generalized Gaussian beam profile (1 or 3 taps):
+    //      sample = beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  alpha is sqrt(2) times get_gaussian_sigma(); beta is the shape from
+    //  get_generalized_gaussian_beta().  For details, see
+    //  scanline_generalized_gaussian_integral_contrib().
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Precompute reciprocals and the normalization factor once:
+    const float3 inv_alpha = float3(1.0)/alpha;
+    const float3 inv_beta = float3(1.0)/beta;
+    const float3 norm = color * beta * 0.5 * inv_alpha /
+        gamma_impl(inv_beta, beta);
+    //  Every codepath needs the tap at dist itself:
+    const float3 tap_center = exp(-pow(abs(dist*inv_alpha), beta));
+    if(!(beam_antialias_level > 0.5))
+    {
+        //  No antialiasing: one tap only.
+        return norm * tap_center;
+    }
+    //  Antialiased: add taps 1/3 pixel nearer and farther, then average.
+    const float3 tap_offset = float3(pixel_height/3.0);
+    const float3 dist_plus = dist + tap_offset;
+    const float3 dist_minus = abs(dist - tap_offset);
+    const float3 tap_plus = exp(-pow(abs(dist_plus*inv_alpha), beta));
+    const float3 tap_minus = exp(-pow(abs(dist_minus*inv_alpha), beta));
+    return norm/3.0 * (tap_center + tap_plus + tap_minus);
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Dispatch to one of four beam-profile implementations according to
+    //  two user codepath options defined outside this function:
+    //      beam_generalized_gaussian: generalized vs. pure Gaussian
+    //      beam_antialias_level:      > 1.5 picks the integral versions,
+    //                                 anything else the sampled versions
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  Evaluate the antialias test once; both Gaussian families share it:
+    const bool use_integral = (beam_antialias_level > 1.5);
+    if(beam_generalized_gaussian && use_integral)
+    {
+        //  Generalized Gaussian, integral version:
+        return scanline_generalized_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+    if(beam_generalized_gaussian)
+    {
+        //  Generalized Gaussian, sampled version:
+        return scanline_generalized_gaussian_sampled_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+    if(use_integral)
+    {
+        //  Pure Gaussian, integral version (shape_range is not needed):
+        return scanline_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range);
+    }
+    //  Pure Gaussian, sampled version (shape_range is not needed):
+    return scanline_gaussian_sampled_contrib(
+        dist, color, pixel_height, sigma_range);
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    const float4x3 colors = float4x3(color0, color1, color2, color3);
+    return max(mul(weights, colors), float3(0.0));  //  no negative colors
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  The weight is static or dynamic branches are allowed, so branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: gamma-encoded color0-3.  GLSL pow() needs a float3 exponent.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(1.0/intermediate_gamma)),
+                pow(color1, float3(1.0/intermediate_gamma)),
+                pow(color2, float3(1.0/intermediate_gamma)),
+                pow(color3, float3(1.0/intermediate_gamma)),
+                weights);
+            //  NOTE(review): gamma_mixed_color is never decoded back with
+            //  pow(..., intermediate_gamma) on non-GAMMA_ENCODE paths; confirm.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Horizontally interpolate 2-4 texels around one output pixel.
+    //  Requires:   1.) scanline_uv is vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to
+    //                  the texel just left of the output pixel.
+    //              2.) uv_step_x holds the horizontal uv step between texels.
+    //              3.) weights holds interpolation filter weights for the
+    //                  texels at offsets -1, 0, +1, +2 steps from scanline_uv.
+    //  Returns:    The weighted lookup, following the conventions of
+    //              get_interpolated_linear_color().
+    //  The two outer texels default to black and are only fetched when
+    //  beam_horiz_filter > 0.5 (Quilez resampling can ignore them).
+    float3 outer_left = float3(0.0);
+    float3 outer_right = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        outer_left = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        outer_right = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    //  The inner texels are always needed.  Colors are passed on as sampled
+    //  (linear or gamma-encoded); the callee handles the difference.
+    const float3 inner_left = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 inner_right = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    return get_interpolated_linear_color(
+        outer_left, inner_left, inner_right, outer_right, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Horizontally filter tex around tex_uv.  TODO: Add function requirements.
+    //  Snap to the previous texel and get sample dists from 2/4 nearby texels:
+    const float2 curr_texel = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y);  //  x snapped, y kept
+    const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv;
+    const float prev_dist = curr_texel.x - prev_texel_hor.x;
+    const float4 sample_dists = float4(1.0 + prev_dist, prev_dist,
+        1.0 - prev_dist, 2.0 - prev_dist);  //  dists to texels -1, 0, +1, +2
+    //  Get Quilez, Gaussian, or Lanczos2 resize weights for 2/4 nearby texels:
+    float4 weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez (2 texels): w2 = 6x^5 - 15x^4 + 10x^3, a quintic ease curve.
+        const float x = sample_dists.y;
+        const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
+        weights = float4(0.0, 1.0 - w2, w2, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian (4 texels): e**(-d**2/(2*beam_horiz_sigma**2)) per texel.
+        float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        weights = exp(-(sample_dists*sample_dists)*inner_denom_inv);
+    }
+    else
+    {
+        //  Lanczos2 (4 texels): sinc(d)*sinc(d/2), sinc(t) = sin(pi*t)/(pi*t).
+        const float4 pi_dists = FIX_ZERO(sample_dists * pi);  //  FIX_ZERO is defined elsewhere; presumably avoids 0/0
+        weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) /
+            (pi_dists * pi_dists);
+    }
+    //  Ensure the weight sum == 1.0:
+    const float4 final_weights = weights/dot(weights, float4(1.0));
+    //  Get the interpolated horizontal scanline color:
+    const float2 uv_step_x = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        tex, prev_texel_hor_uv, uv_step_x, final_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Horizontally sample a scanline, optionally shifting each of the R, G,
+    //  and B channels by its own horizontal convergence offset.
+    //  TODO: Add function requirements.
+    if(!beam_misconvergence)
+    {
+        //  All channels share one lookup position:
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+    //  Scale the per-channel x offsets by texture_size_inv.x to get uv
+    //  offsets, then subtract them from tex_uv:
+    const float3 offsets_x = get_convergence_offsets_x_vector();
+    const float3 offsets_u = offsets_x * texture_size_inv.xxx;
+    const float2 uv_red = tex_uv - float2(offsets_u.r, 0.0);
+    const float2 uv_green = tex_uv - float2(offsets_u.g, 0.0);
+    const float2 uv_blue = tex_uv - float2(offsets_u.b, 0.0);
+    //  Do a full lookup per channel, keeping only that channel from each:
+    const float red = sample_single_scanline_horizontal(
+        tex, uv_red, tex_size, texture_size_inv).r;
+    const float green = sample_single_scanline_horizontal(
+        tex, uv_green, tex_size, texture_size_inv).g;
+    const float blue = sample_single_scanline_horizontal(
+        tex, uv_blue, tex_size, texture_size_inv).b;
+    return float3(red, green, blue);
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only consider even/odd scanlines every
+    //  other frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    //  Returns the snapped scanline's uv and writes the vertical distance to dist.
+    //  FIXME (porting leftover, disabled): const float interlace_bff1 = 1.0;
+    const float field_offset = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);  //  floor() term: 0 at y-step 1, 1 at y-step 2
+    const float2 curr_texel = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel_num = floor(curr_texel - float2(under_half));
+    const float wrong_field = fmod(
+        prev_texel_num.y + field_offset, il_step_multiple.y);  //  rows to step back (subtracted below)
+    const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 scanline_texel = scanline_texel_num + float2(0.5);
+    const float2 scanline_uv = scanline_texel * texture_size_inv;
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
+    return scanline_uv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Guess whether the source is interlaced from its line count.  Always
+    //  returns false when the interlace_detect option is off.
+    if(!interlace_detect)
+    {
+        return false;
+    }
+    //  Reference line counts:
+    //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+    //  NTSC Emulators: Typically 224 or 240 lines
+    //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+    //  PAL Emulators: ?
+    //  ATSC: 720p, 1080i, 1080p
+    //  Cutoff assumptions:
+    //  1.) We only need to care about active lines.
+    //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+    //  3.) Anything > 576 lines is probably not interlaced...
+    //  4.) ...except 1080 lines, which is a crapshoot (user decision via
+    //      interlace_1080i).
+    //  5.) The thresholds are nudged by 0.5 in case the main program uses
+    //      calculated (inexact) video sizes.
+    //  SD range first, then the optional 1080-line special case:
+    const bool in_sd_range = (num_lines > 288.5) && (num_lines < 576.5);
+    const bool is_1080 = (num_lines > 1079.5) && (num_lines < 1080.5);
+    const bool treat_1080_as_interlaced = bool(interlace_1080i) && is_1080;
+    return in_sd_range || treat_1080_as_interlaced;
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+///////////////////////////////  END VERTEX-INCLUDES  /////////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+float bloom_approx_scale_x = targetSize.x / sourceSize[0].y;  //  NOTE(review): target width over source[0] height; confirm the x/y mix is intended
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);  //  evaluates to 1474560.0
+const float bloom_diff_thresh_ = 1.0/256.0;  //  thresh passed to get_min_sigma_to_blur_triad() in main()
+
+// copied from bloom-functions.h
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Requires:   1.) triad_size: final phosphor triad size in pixels.
+    //              2.) thresh: max desired pixel difference in the blurred
+    //                  triad (e.g. 1.0/256.0).
+    //  Returns:    The minimum sigma that fully blurs a phosphor triad on
+    //              the screen to an even color, within thresh.
+    //  Curve-fit closed form: max error ~0.086036, mean sq. error ~0.0013387.
+    //  (A simpler fit, 0.5985*triad_size - triad_size*sqrt(thresh), has max
+    //  error ~0.16486 and mean sq. error ~0.0041041.)
+    const float linear_part = 0.6113*triad_size;
+    const float thresh_part = 1.122*triad_size*sqrt(0.000416 + thresh);
+    return -0.05168 + linear_part - thresh_part;
+}
+
+void main() {  //  Vertex entry point: sets gl_Position and the uv/sigma variables assigned below
+   gl_Position = position;
+   vTexCoord = texCoord * 1.00001;  //  NOTE(review): slight upscale of texCoord; reason not visible here
+   float2 tex_uv = vTexCoord.xy;
+    //  Our various input textures use different coords:
+    float2 video_uv = tex_uv * texture_size/video_size;
+    //  Rescale video_uv by the MASKED_SCANLINES video/texture size ratio:
+    scanline_tex_uv = video_uv * MASKED_SCANLINESvideo_size /
+        MASKED_SCANLINEStexture_size;
+    blur3x3_tex_uv = video_uv;// * BLOOM_APPROXvideo_size / BLOOM_APPROXtexture_size;
+    //  (The BLOOM_APPROX rescale above is disabled, so video_uv is used as-is.)
+    //  Calculate a runtime bloom_sigma in case it's needed:
+    const float mask_tile_size_x = get_resized_mask_tile_size(
+        output_size, output_size * mask_resize_viewport_scale, false).x;
+    bloom_sigma_runtime = get_min_sigma_to_blur_triad(
+        mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_);  //  triad size = tile size / triads per tile
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.fs b/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.fs
new file mode 100644
index 00000000..c89e4671
--- /dev/null
+++ b/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.fs
@@ -0,0 +1,4748 @@
+#version 150
+
+uniform sampler2D source[];
+uniform vec4 sourceSize[];
+uniform vec4 targetSize;
+uniform int phase;
+
+in Vertex {
+   vec2 vTexCoord;
+   vec2 uv_step;
+   float interlaced;
+};
+
+out vec4 FragColor;
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms
+#define mul(a,b) (b*a)  // expands with the operands swapped
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy  // same expression as texture_size, so texture_size/video_size is 1
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  // expands to nothing: strips the qualifier everywhere below
+#define inline  // expands to nothing: strips the qualifier everywhere below
+#define const  // expands to nothing: strips the qualifier everywhere below
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  // NOTE(review): swaps its arguments, but HLSL atan2 and GLSL atan both take (y, x); confirm call sites
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]
+
+#ifdef GL_ES
+#ifdef GL_FRAGMENT_PRECISION_HIGH
+precision highp float;
+#else
+precision mediump float;
+#endif
+#define COMPAT_PRECISION mediump
+#else
+#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+#define COMPAT_VARYING in
+#define COMPAT_TEXTURE texture
+#else
+#define COMPAT_VARYING varying
+#define FragColor gl_FragColor
+#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  PASS SETTINGS:
+//  gamma-management.h needs to know what kind of pipeline we're using and
+//  what pass this is in that pipeline.  This will become obsolete if/when we
+//  can #define things like this in the .cgp preset file.
+#define FIRST_PASS
+#define SIMULATE_CRT_ON_LCD
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "bind-shader-h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5 (a previous editor found that too dark and preferred 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  1: 3x3 resize blur
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5 (a previous editor found that too dark and preferred 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  1: 3x3 resize blur
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero (using a macro overloads for float, float2, etc.):
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; NOTE: abs() drops c's sign
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+    //  The offset may be negative (the default is): pad by its magnitude.
+    static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x);
+    //  Most antialiasing filters have a base radius of 4.0 pixels:
+    static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Build a unit-length (x, y) aspect vector for the curvature uv-mapping.
+    //  The requested ratio is capped at geom_max_aspect_ratio first, and
+    //  normalizing (ratio, 1.0) discards the absolute scale so only the
+    //  proportions remain:
+    return normalize(float2(
+        min(geom_aspect_ratio, geom_max_aspect_ratio),
+        1.0));
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Packs the geom_overscan_x/geom_overscan_y scalars into one float2.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Packs the geom_tilt_angle_x/geom_tilt_angle_y scalars into one float2.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  Packs the x convergence offsets into one float3, in (r, g, b) order.
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  Packs the y convergence offsets into one float3, in (r, g, b) order.
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Packs the r channel's x and y convergence offsets into one float2.
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Packs the g channel's x and y convergence offsets into one float2.
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Packs the b channel's x and y convergence offsets into one float2.
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        return aa_subpixel_r_offset_static;
+    #else
+        #ifndef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            return aa_subpixel_r_offset_static;
+        #else
+            //  Both runtime options on.  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #endif
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Gain for the mask picked by mask_type: 1.0 over its average color.
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    if(mask_type < 0.5) return grille_gain;
+    return mask_type < 1.5 ? slot_gain : shadow_gain;
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Start from the runtime-selected mode when runtime selection is compiled
+    //  in; otherwise fall back to the static setting.
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        float sample_mode = mask_sample_mode_desired;
+    #else
+        float sample_mode = mask_sample_mode_static;
+    #endif
+    //  Without PHOSPHOR_MASK_MANUALLY_RESIZE the result is restricted to
+    //  [1, 2], which rules out mode 0:
+    #ifndef PHOSPHOR_MASK_MANUALLY_RESIZE
+        sample_mode = clamp(sample_mode, 1.0, 2.0);
+    #endif
+    return sample_mode;
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, unless the includer #defined OVERRIDE_STANDARD_GAMMA and supplies all seven itself:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  When true, the encode/decode helpers in this file write alpha as 1.0.
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma accessors: with OVERRIDE_DEVICE_GAMMA they forward the includer's crt_gamma/gba_gamma/lcd_gamma, else the defaults below:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else   //  No override: the first SIMULATE_* macro found defined (in the order tested below) selects the input/output gammas.
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif  //  LAST_PASS
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass both decodes its input and encodes its output.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct (true exactly when this pass does not linearize its input itself):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{   //  Raise color.rgb to 1.0/get_pass_output_gamma() when gamma_encode_output is set; otherwise return color untouched.
+    if(gamma_encode_output)
+    {
+        if(assume_opaque_alpha)
+        {   //  Ignore the incoming alpha and write 1.0.
+            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0);
+        }
+        else
+        {   //  Pass the incoming alpha through unchanged.
+            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a);
+        }
+    }
+    else
+    {   //  No encoding for this pass.
+        return color;
+    }
+}
+
+inline float4 decode_input(const float4 color)
+{   //  Raise color.rgb to get_pass_input_gamma() when linearize_input is set; otherwise return color untouched.
+    if(linearize_input)
+    {
+        if(assume_opaque_alpha)
+        {   //  Ignore the incoming alpha and write 1.0.
+            return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0);
+        }
+        else
+        {   //  Pass the incoming alpha through unchanged.
+            return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a);
+        }
+    }
+    else
+    {   //  No decoding for this pass.
+        return color;
+    }
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{   //  Unlike decode_input(), this never checks linearize_input: rgb is always raised to the caller-supplied per-channel gamma.
+    if(assume_opaque_alpha)
+    {   //  Ignore the incoming alpha and write 1.0.
+        return float4(pow(color.rgb, gamma), 1.0);
+    }
+    else
+    {   //  Pass the incoming alpha through unchanged.
+        return float4(pow(color.rgb, gamma), color.a);
+    }
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D (the coords parameter is taken without "const"; see the TODO/FIXME note above this section):
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }   //  Only tex_coords.xy is sampled; .z is not read.
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }   //  NOTE(review): texel_off is passed as textureLod's LOD argument, not applied as a texel offset -- confirm intended.
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }   //  Same as above; only tex_coords.xy is sampled.
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod (only tex_coords.xy is sampled; the first overload hard-codes LOD 0.0 and never reads tex_coords.zw):
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }   //  NOTE(review): texel_off is passed as the LOD argument, not as a texel offset -- confirm intended.
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
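+//  NOTE: the block comment opened before "tex3D:" above is NOT terminated on this line; it stays open until the terminator that follows tex2Dfetch_linearize_gamma below.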
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod (decoded with the caller-supplied gamma; only tex_coords.xy is sampled, and the first overload hard-codes LOD 0.0):
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }   //  NOTE(review): texel_off is passed as the LOD argument, not as a texel offset -- confirm intended.
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an earlier note found 0.5 too dark and suggested 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an earlier note found 0.5 too dark and suggested 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifdef RADEON_FIX
+    //  RADEON_FIX builds select filter 1 (the 3x3 resize blur):
+    static const float bloom_approx_filter_static = 1.0;
+#else
+    //  All other builds select filter 2 (the true 4x4 Gaussian resize):
+    static const float bloom_approx_filter_static = 2.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  Every constant in this section mirrors a setting in crt-royale.cgp (or
+//  whichever related .cgp file is in use) and MUST be kept in sync with it by
+//  hand, because these values can't be set directly in the .cgp file.  Stale
+//  values are likely to cause artifacts, the wrong phosphor mask size, etc.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Absolute scale_x of the BLOOM_APPROX pass.  This shader has two major
+//  versions: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so both are stored:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Viewport-relative scales of the phosphor mask resize passes (MASK_RESIZE
+//  and the pass immediately preceding it), copied from the .cgp file:
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  The geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  These describe the phosphor mask texture named in crt-royale.cgp.  The
+//  shader optionally resizes one mask tile based on user settings, then
+//  repeats that single tile until the screen is filled.  To do so it must know
+//  the input texture size (default 64x64), and for manual resizing it must
+//  also know the number of horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  The phosphor mask dims the image, so its average brightness is needed to
+//  compensate.  The four values below are roughly correct for the masks
+//  included with the shader; update the matching value for any LUT texture
+//  you change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on
+//  whether the loaded aperture grille uses 14-pixel or 15-pixel stripes
+//  (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+//  Pick the aperture grille average that matches the stripe width selected
+//  above (15-pixel unless PHOSPHOR_MASK_GRILLE14 is #defined):
+#ifndef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero by clamping |c| to at least 2^-16.  The result is never negative (abs() drops the sign of c).  Using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  The first pass must decode CRT gamma and the last must encode LCD gamma,
+//  so turn SIMULATE_CRT_ON_LCD on unless it is already #defined.
+#if !defined(SIMULATE_CRT_ON_LCD)
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  a.) 129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  b.) 128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  c.) 124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost by
+//  uncommenting the following #define:
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE (which may be #defined by either
+//  user-settings.h or a wrapper .cg that #includes the current .cg pass)
+//  targets Intel HD 4000 GPU's.  They can't handle manual mask resizing (for
+//  now), setting the geometry mode at runtime, or a 4x4 true Gaussian resize,
+//  so disable those incompatible settings ASAP.
+#if defined(INTEGRATED_GRAPHICS_COMPATIBILITY_MODE)
+    #if defined(PHOSPHOR_MASK_MANUALLY_RESIZE)
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #if defined(RUNTIME_GEOMETRY_MODE)
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Filter 2 (4x4 Gaussian resize) won't work here, and filter 1 (3x3
+    //  blur) is inferior in most cases, so map 2.0 to 0.0 and pass any other
+    //  static choice through unchanged:
+    static const float bloom_approx_filter =
+        (bloom_approx_filter_static <= 1.5) ? bloom_approx_filter_static : 0.0;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Without RUNTIME_SHADER_PARAMS_ENABLE the static parameters are used, so
+//  switch off every slow runtime codepath option.  Most of them would be
+//  harmless anyway once the params are disabled, but some would not.
+#if !defined(RUNTIME_SHADER_PARAMS_ENABLE)
+    //  Geometry:
+    #if defined(RUNTIME_GEOMETRY_MODE)
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #if defined(RUNTIME_GEOMETRY_TILT)
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    //  Antialiasing:
+    #if defined(RUNTIME_ANTIALIAS_WEIGHTS)
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #if defined(RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS)
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    //  Scanlines, bloom, and phosphor mask:
+    #if defined(RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE)
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #if defined(RUNTIME_PHOSPHOR_BLOOM_SIGMA)
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #if defined(FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT)
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Wherever a tex2Dlod strategy is requested, also request its tex2Dbias
+//  counterpart as a backup for wider compatibility.
+#if defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD)
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+#if defined(ANISOTROPIC_TILING_COMPAT_TEX2DLOD)
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+//  Drop every anisotropic compatibility strategy whose DRIVERS_ALLOW_*
+//  capability flag is not #defined.
+//  No tex2Dbias: the tex2Dbias tiling/resampling strategies are unavailable.
+#if !defined(DRIVERS_ALLOW_TEX2DBIAS)
+    #if defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS)
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+    #if defined(ANISOTROPIC_TILING_COMPAT_TEX2DBIAS)
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  No tex2Dlod: the tex2Dlod tiling/resampling strategies are unavailable, and
+//  ANTIALIAS_DISABLE_ANISOTROPIC is dropped along with them.
+#if !defined(DRIVERS_ALLOW_TEX2DLOD)
+    #if defined(ANTIALIAS_DISABLE_ANISOTROPIC)
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+    #if defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD)
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #if defined(ANISOTROPIC_TILING_COMPAT_TEX2DLOD)
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+#endif
+//  No derivatives: "fix discontinuities" is unavailable.
+#if !defined(DRIVERS_ALLOW_DERIVATIVES)
+    #if defined(ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES)
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable the rest (at most one stays #defined), all nested in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else  //  tex2Dlod tiling is not selected: tex2Dbias is the next choice
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else  //  neither tex2Dlod nor tex2Dbias: "tile flat twice" beats "fix discontinuities"
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias tiling strategies have a lot in common, so
+//  #define one shared "family" flag whenever either is active.  This reduces
+//  some #ifdef nesting in the next section.
+#if defined(ANISOTROPIC_TILING_COMPAT_TEX2DLOD) || defined(ANISOTROPIC_TILING_COMPAT_TEX2DBIAS)
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize the anisotropic resampling strategies the same way: when
+//  tex2Dlod resampling is active, its tex2Dbias backup is not needed.
+#if defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD) && defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS)
+    #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  Resize from the large mipmapped LUT whenever it can be used without
+//  mipmapping artifacts (it gives us more options for using fewer samples);
+//  otherwise resize from the small LUT.
+#if defined(DRIVERS_ALLOW_TEX2DLOD) && defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD)
+    //  TODO: Take advantage of this!
+    #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  That makes runtime
+//  phosphor mask selection hard: Assigning to a uniform global in the vertex
+//  shader, or selecting a sampler2D there and passing it to the fragment
+//  shader (even with explicit TEXUNIT# bindings), just gives us the input
+//  texture or a black screen.  The workaround is to call tex2D three times
+//  with different uniform samplers (or to resize the phosphor mask three times
+//  altogether).  With dynamic branches, only one of those branches is
+//  processed, on top of quickly discarding fragments we don't need (cgc seems
+//  able to overcome limitations around dependent texture fetches inside of
+//  branches).  Without dynamic branches, every branch is processed for every
+//  fragment, which is slower, and so is runtime sampling mode selection.
+//  So: Enable runtime selection automatically when dynamic branches are
+//  allowed, and otherwise only when the user's static #define forces it.
+#if defined(DRIVERS_ALLOW_DYNAMIC_BRANCHES) || defined(FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT)
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        //  The red subpixel offset may be negative (the blue offset is its
+        //  negation), and an offset in either direction widens the sampled
+        //  footprint, so pad by its magnitude rather than its signed value.
+        //  A signed negative offset would otherwise shrink the border.
+        static const float max_subpixel_offset =
+            abs(aa_subpixel_r_offset_static.x);
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    //  No same-pass antialiasing, so no antialiasing border is needed:
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border, except when a
+//  tex2Dlod-family tiling strategy is active:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#endif
+//  Round the pixel border up to an integer texel border, assuming same-pass
+//  curvature (GEOMETRY_EARLY) about triples the texel frequency:
+#ifndef GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#else
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#endif
+//  Express the texel border in tiles using worst-case assumptions, i.e. the
+//  smallest allowed tile (minimum triad size times triads per tile):
+static const float max_mask_tile_border =
+    max_mask_texel_border / (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, choose how many resized tiles to render to MASK_RESIZE and the
+//  first texel (inside the borders) to sample from.
+#if !defined(GEOMETRY_EARLY) && defined(ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE)
+    //  Special case ("tile flat twice"): Render two tiles without borders.
+    //  Anisotropic filtering doesn't seem to be a problem here.
+    static const float mask_resize_num_tiles = 1.0 + 1.0;
+    static const float mask_start_texels = 0.0;
+#else
+    //  General case: One tile plus a border on each side.
+    static const float mask_resize_num_tiles = 1.0 + 2.0 * max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  mask_resize_num_tiles must fit into an FBO whose viewport scale is
+//  mask_resize_viewport_scale, which limits the maximum final triad size.
+//  Estimate the minimum number of triads the screen can be split into in each
+//  dimension (this is only as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads = mask_triads_per_tile * mask_resize_num_tiles;
+static const float2 min_allowed_viewport_triads = float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;    //  pi truncated to 12 decimal places
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here and
+//  use it in place of the literal 0.5 offset inside floor():
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;    //  deliberately just under 0.5 (see above)
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral[0, x](e**(-t**2) dt)
+    //              This approximation has a max absolute error of 2.5*10**-5
+    //              with solid numerical robustness and efficiency.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    static const float4 one = float4(1.0);
+    const float4 sign_x = sign(x);
+    const float4 t = one/(one + 0.47047*abs(x));
+    const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+        exp(-(x*x));
+    return result * sign_x;
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload of the Abramowitz/Stegun erf() approximation:
+    static const float3 ones = float3(1.0);
+    const float3 t = ones/(ones + 0.47047*abs(x));
+    const float3 erf_abs = ones -
+        t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*exp(-(x*x));
+    const float3 x_sign = sign(x);
+    return erf_abs * x_sign;
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload of the Abramowitz/Stegun erf() approximation:
+    static const float2 ones = float2(1.0);
+    const float2 t = ones/(ones + 0.47047*abs(x));
+    const float2 erf_abs = ones -
+        t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*exp(-(x*x));
+    const float2 x_sign = sign(x);
+    return erf_abs * x_sign;
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload of the Abramowitz/Stegun erf() approximation:
+    const float t = 1.0/(1.0 + 0.47047*abs(x));
+    const float erf_abs = 1.0 -
+        t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*exp(-(x*x));
+    const float x_sign = sign(x);
+    return erf_abs * x_sign;
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    A hyperbolic tangent approximation of erf().  Its error is
+    //              visually noticeable, but it's very fast and perceptually
+    //              close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly implement
+    //              tanh(): The author's nVidia 8800GTS returned garbage output.
+    return tanh(x * 1.202760580);
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 overload of the tanh-based erf() approximation:
+    return tanh(x * 1.202760580);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 overload of the tanh-based erf() approximation:
+    return tanh(x * 1.202760580);
+}
+
+float erft(const float x)
+{
+    //  Scalar overload of the tanh-based erf() approximation:
+    return tanh(x * 1.202760580);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 overload:
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 overload:
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload:
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller must precompute this value,
+    //                  which gives it the opportunity to reuse the reciprocal
+    //                  elsewhere.
+    //  Returns:    An approximation of the gamma function (real-numbered
+    //              factorial) from a two-coefficient Lanczos approximation,
+    //              with coefficients from Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  Three coeffs (0.0000346 error) wouldn't hurt
+    //              latency, but two allow more parallelism with outside
+    //              instructions.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler_e = float4(2.71828182845904523536028747135266249775724709);
+    const float4 s_plus_half = s + float4(0.5);
+    const float4 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    const float4 series = coeff0 + coeff1/(s + float4(1.0));
+    //  gamma(s + 1) = pow_base**s_plus_half * series; multiplying by s_inv
+    //  gives gamma(s), with less error for small s than starting at (s - 1.0).
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 overload of the two-coefficient Lanczos approximation:
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler_e = float3(2.71828182845904523536028747135266249775724709);
+    const float3 s_plus_half = s + float3(0.5);
+    const float3 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    const float3 series = coeff0 + coeff1/(s + float3(1.0));
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 overload of the two-coefficient Lanczos approximation:
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler_e = float2(2.71828182845904523536028747135266249775724709);
+    const float2 s_plus_half = s + float2(0.5);
+    const float2 pow_base = (s_plus_half + lanczos_g)/euler_e;
+    const float2 series = coeff0 + coeff1/(s + float2(1.0));
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload of the two-coefficient Lanczos approximation:
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler_e = 2.71828182845904523536028747135266249775724709;
+    const float s_plus_half = s + 0.5;
+    const float pow_base = (s_plus_half + lanczos_g)/euler_e;
+    const float series = coeff0 + coeff1/(s + 1.0);
+    return (pow(pow_base, s_plus_half) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard gamma function parameter in [0, 36].
+    //  Returns:    Approximate gamma function output with a maximum relative
+    //              error of 0.000463.  See gamma_impl() for details.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    const float3 s_inv = float3(1.0)/s;     //  Float3 overload
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    const float2 s_inv = float2(1.0)/s;     //  Float2 overload
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    const float s_inv = 1.0/s;              //  Scalar overload
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and small z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    The first 4 terms of a series representation of the lower
+    //              incomplete gamma function, for small s and small z.
+    //  The "rolled up" summation (before scaling by z**s) looks like:
+    //      term_sign = 1.0; z_pow = 1.0; factorial = 1.0;
+    //      sum = term_sign * z_pow / ((s + 0.0) * factorial);
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          term_sign *= -1.0; z_pow *= z; factorial *= k;
+    //          sum += term_sign * z_pow / ((s + k) * factorial);
+    //      }
+    //  Unrolled, constant-unfolded and arranged for madds and parallelism:
+    const float4 z_squared = z*z;
+    const float4 z_cubed = z * z_squared;
+    const float4 div1 = s + float4(1.0);            //  1! * (s + 1)
+    const float4 div2 = 2.0*s + float4(4.0);        //  2! * (s + 2)
+    const float4 div3 = 6.0*s + float4(18.0);       //  3! * (s + 3)
+    //const float4 div4 = 24.0*s + float4(96.0);    //  4! * (s + 4)
+    float4 series = s_inv;          //  Term 0
+    series -= z/div1;               //  Term 1
+    series += z_squared/div2;       //  Term 2
+    series -= z_cubed/div3;         //  Term 3
+    //series += z_squared * z_squared / div4;   //  Term 4 (unused)
+    //  Scale the truncated series by z**s and return:
+    const float4 z_pow_s = pow(z, s);
+    return z_pow_s * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 overload of the 4-term series:
+    const float3 z_squared = z*z;
+    const float3 z_cubed = z * z_squared;
+    const float3 div1 = s + float3(1.0);
+    const float3 div2 = 2.0*s + float3(4.0);
+    const float3 div3 = 6.0*s + float3(18.0);
+    float3 series = s_inv;
+    series -= z/div1;
+    series += z_squared/div2;
+    series -= z_cubed/div3;
+    return pow(z, s) * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 overload of the 4-term series:
+    const float2 z_squared = z*z;
+    const float2 z_cubed = z * z_squared;
+    const float2 div1 = s + float2(1.0);
+    const float2 div2 = 2.0*s + float2(4.0);
+    const float2 div3 = 6.0*s + float2(18.0);
+    float2 series = s_inv;
+    series -= z/div1;
+    series += z_squared/div2;
+    series -= z_cubed/div3;
+    return pow(z, s) * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload of the 4-term series:
+    const float z_squared = z*z;
+    const float z_cubed = z * z_squared;
+    const float div1 = s + 1.0;
+    const float div2 = 2.0*s + 4.0;
+    const float div3 = 6.0*s + 18.0;
+    float series = s_inv;
+    series -= z/div1;
+    series += z_squared/div2;
+    series -= z_cubed/div3;
+    return pow(z, s) * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation of the upper
+    //              incomplete gamma function, truncated to 4 terms.
+    //  The "rolled up" continued fraction looks like this.  Its denominator
+    //  is truncated, and it's evaluated "from the bottom up:"
+    //      cf_denom = float4('inf');
+    //      float4 one = float4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          cf_denom = ((i * 2.0) - one) + z - s + (i * (s - i))/cf_denom;
+    //      }
+    //  Unrolled and constant-unfolded for madds and parallelism:
+    float4 cf_denom = float4(7.0) + z - s;
+    cf_denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/cf_denom;
+    cf_denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/cf_denom;
+    cf_denom = float4(1.0) + z - s + (s - float4(1.0))/cf_denom;
+    const float4 scaled_pow = pow(z, s) * exp(-z);
+    return scaled_pow / cf_denom;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 overload of the 4-term continued fraction:
+    float3 cf_denom = float3(7.0) + z - s;
+    cf_denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/cf_denom;
+    cf_denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/cf_denom;
+    cf_denom = float3(1.0) + z - s + (s - float3(1.0))/cf_denom;
+    const float3 scaled_pow = pow(z, s) * exp(-z);
+    return scaled_pow / cf_denom;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 overload of the 4-term continued fraction:
+    float2 cf_denom = float2(7.0) + z - s;
+    cf_denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/cf_denom;
+    cf_denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/cf_denom;
+    cf_denom = float2(1.0) + z - s + (s - float2(1.0))/cf_denom;
+    const float2 scaled_pow = pow(z, s) * exp(-z);
+    return scaled_pow / cf_denom;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload of the 4-term continued fraction:
+    float cf_denom = 7.0 + z - s;
+    cf_denom = 5.0 + z - s + (3.0*s - 9.0)/cf_denom;
+    cf_denom = 3.0 + z - s + (2.0*s - 4.0)/cf_denom;
+    cf_denom = 1.0 + z - s + (s - 1.0)/cf_denom;
+    const float scaled_pow = pow(z, s) * exp(-z);
+    return scaled_pow / cf_denom;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    An approximation of the normalized lower incomplete gamma
+    //              function for s < 0.5.  With s this small, only two branches
+    //              (not four) are needed, selected by z.  Each branch uses four
+    //              terms, with a max relative error of ~0.00182.  The branch
+    //              threshold and specifics were adapted for fewer terms from
+    //              Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Evaluate both branches: Real branches test slower even when available.
+    const float4 via_cf = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float4 via_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    //  Build per-component masks to select between the branch results:
+    static const float4 z_cutoff = float4(0.775075);
+    bool4 use_cf;
+    use_cf.x = z.x > z_cutoff.x;
+    use_cf.y = z.y > z_cutoff.y;
+    use_cf.z = z.z > z_cutoff.z;
+    use_cf.w = z.w > z_cutoff.w;
+    bool4 use_series = not(use_cf);
+    return via_cf * float4(use_cf) + via_series * float4(use_series);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  Float3 overload; both branches are evaluated, then masked by z:
+    const float3 via_cf = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float3 via_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    static const float3 z_cutoff = float3(0.775075);
+    bool3 use_cf;
+    use_cf.x = z.x > z_cutoff.x;
+    use_cf.y = z.y > z_cutoff.y;
+    use_cf.z = z.z > z_cutoff.z;
+    bool3 use_series = not(use_cf);
+    return via_cf * float3(use_cf) + via_series * float3(use_series);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  Float2 overload; both branches are evaluated, then masked by z:
+    const float2 via_cf = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float2 via_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    static const float2 z_cutoff = float2(0.775075);
+    bool2 use_cf;
+    use_cf.x = z.x > z_cutoff.x;
+    use_cf.y = z.y > z_cutoff.y;
+    bool2 use_series = not(use_cf);
+    return via_cf * float2(use_cf) + via_series * float2(use_series);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload; both branches are evaluated, then masked by z:
+    const float via_cf = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float via_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    static const float z_cutoff = 0.775075;
+    const bool use_cf = z > z_cutoff;
+    return via_cf * float(use_cf) + via_series * float(!use_cf);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    An approximation of the normalized lower incomplete gamma
+    //              function for s < 0.5.  See normalized_ligamma_impl().
+    const float4 inv_s = float4(1.0)/s;
+    const float4 inv_gamma_s = float4(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  Float3 overload:
+    const float3 inv_s = float3(1.0)/s;
+    const float3 inv_gamma_s = float3(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  Float2 overload:
+    const float2 inv_s = float2(1.0)/s;
+    const float2 inv_gamma_s = float2(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload:
+    const float inv_s = 1.0/s;
+    const float inv_gamma_s = 1.0/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRTs seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.  Defaults to false unless overridden:
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma accessors (defaults unless OVERRIDE_DEVICE_GAMMA is defined):
+#ifndef OVERRIDE_DEVICE_GAMMA
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; }    //  GBA; in (3.0, 4.0)
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#else
+    //  The user promises to globally define crt_gamma/gba_gamma/lcd_gamma:
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else   //  Derive gammas from the SIMULATE_* macros (first match wins):
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  Next in precedence: SIMULATE_GBA_ON_LCD
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else   //  Next in precedence: SIMULATE_LCD_ON_CRT
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Next in precedence: SIMULATE_GBA_ON_CRT
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Not the first pass: input is not decoded in this pass
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Not the last pass: output is not encoded in this pass
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif  //  LAST_PASS
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Middle/last passes read what the previous pass encoded with the intermediate gamma
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  First/middle passes encode with the intermediate gamma
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct (defined as: this pass does NOT linearize its input in the shader):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{   //  Returns color with rgb raised to 1/get_pass_output_gamma() when gamma_encode_output is set.
+    if(gamma_encode_output)
+    {
+        if(assume_opaque_alpha)
+        {   //  Alpha is assumed opaque: write 1.0 rather than reading color.a.
+            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0);
+        }
+        else
+        {   //  Alpha is carried through unchanged; only rgb is gamma-encoded.
+            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a);
+        }
+    }
+    else
+    {   //  This pass does not encode its output: return the color untouched.
+        return color;
+    }
+}
+
+inline float4 decode_input(const float4 color)
+{   //  Returns color with rgb raised to get_pass_input_gamma() when linearize_input is set.
+    if(linearize_input)
+    {
+        if(assume_opaque_alpha)
+        {   //  Alpha is assumed opaque: write 1.0 rather than reading color.a.
+            return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0);
+        }
+        else
+        {   //  Alpha is carried through unchanged; only rgb is decoded.
+            return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a);
+        }
+    }
+    else
+    {   //  This pass does not linearize its input: return the color untouched.
+        return color;
+    }
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{   //  Unconditionally decodes rgb with the caller-supplied per-channel gamma (linearize_input is not consulted).
+    if(assume_opaque_alpha)
+    {   //  Alpha is assumed opaque: write 1.0 rather than reading color.a.
+        return float4(pow(color.rgb, gamma), 1.0);
+    }
+    else
+    {   //  Alpha is carried through unchanged.
+        return float4(pow(color.rgb, gamma), color.a);
+    }
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample, then run the texel through decode_input().  The float3 overloads use only tex_coords.xy.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }   //  NOTE(review): texel_off is passed as textureLod's third (LOD) argument, not applied as a texel offset -- confirm intended
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }    //  NOTE(review): same as above; texel_off lands in the LOD position
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: NOTE(review): tex_coords.zw are ignored; the third textureLod argument is 0.0 in the first overload and texel_off in the second -- confirm intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
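+/////////*  NOTE(review): this line does not terminate the block comment opened just above "tex3D:", so everything down to the next comment terminator (right before the live "tex2Dlod:" gamma wrappers) is commented out, including tex2D_linearize_gamma() -- confirm this is intended.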
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: decode with a caller-supplied gamma.  NOTE(review): tex_coords.zw are ignored; the third textureLod argument is 0.0 or texel_off -- confirm intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  containing the desired minimum and maximum beam standard
+    //                  deviations, for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; we take
+    //                  sigma_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
+    //              inner f(color) subfunction (see below) as:
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The per-channel standard deviation of the Gaussian beam for "color":
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube balances
+    //  between variable width and a soft/realistic shape.   A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Use a power function: f(color) = pow(color, beam_spot_power)
+        return float3(beam_min_sigma) + sigma_range *
+            pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Use a spherical function: f(color) = sqrt(1 - (color - 1)^2)
+        const float3 color_minus_1 = color - float3(1.0);
+        return float3(beam_min_sigma) + sigma_range *
+            sqrt(float3(1.0) - color_minus_1*color_minus_1);
+    }
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  containing the desired min/max generalized Gaussian
+    //                  beta parameters, for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; we take
+    //                  shape_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian "shape" parameter beta for
+    //              the given color, computed separately for each channel.
+    //  Details/Discussion:
+    //  Beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    return beam_min_shape + shape_range * pow(color, float3(beam_shape_power));  //  beta = min + range * color^power
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  Details:
+    //  The CRT beam profile follows a roughly Gaussian distribution which is
+    //  wider for bright colors than dark ones.  The integral over the full
+    //  range of a Gaussian function is always 1.0, so we can vary the beam
+    //  with a standard deviation without affecting brightness.  'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  Use a numerical approximation of the "error function" (the Gaussian
+    //  indefinite integral) to find the definite integral of the scanline's
+    //  average brightness over a given pixel area.  Even if curved coords were
+    //  used in this pass, a flat scalar pixel height works almost as well as a
+    //  pixel height computed from a full pixel-space to scanline-space matrix.
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 ph_offset = float3(pixel_height * 0.5);    //  Half the pixel height, in scanlines
+    const float3 denom_inv = 1.0/(sigma*sqrt(2.0));         //  1/(sigma*sqrt(2)), the erf argument scale above
+    const float3 integral_high = erf((dist + ph_offset)*denom_inv);     //  erf at dist + half a pixel
+    const float3 integral_low = erf((dist - ph_offset)*denom_inv);      //  erf at dist - half a pixel
+    return color * 0.5*(integral_high - integral_low)/pixel_height;     //  Definite integral divided by pixel_height
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models the standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 s = float3(1.0)/beta;
+    const float3 ph_offset = float3(pixel_height * 0.5);
+    //  Pass beta to gamma_impl to avoid repeated divides.  Similarly pass
+    //  beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
+    const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta);
+    const float3 dist1 = dist + ph_offset;  //  Upper integration bound: dist + half a pixel
+    const float3 dist0 = dist - ph_offset;  //  Lower integration bound: dist - half a pixel
+    const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
+        s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
+    const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
+        s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
+    return color * 0.5*(integral_high - integral_low)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  See scanline_gaussian_integral_contrib() for detailed comments!
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
+    //  Avoid repeated divides:
+    const float3 sigma_inv = float3(1.0)/sigma;
+    const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv;   //  1/(2*sigma**2)
+    const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi);        //  1/(sigma*sqrt(2*pi))
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel away in each direction as well:
+        const float3 sample_offset = float3(pixel_height/3.0);
+        const float3 dist2 = dist + sample_offset;
+        const float3 dist3 = abs(dist - sample_offset);     //  abs() has no effect on the result: dist3 is only squared below
+        //  Average three pure Gaussian samples:
+        const float3 scale = color/3.0 * outer_denom_inv;
+        const float3 weight1 = exp(-(dist*dist)*inner_denom_inv);
+        const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv);
+        const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv);
+        return scale * (weight1 + weight2 + weight3);
+    }
+    else
+    {   //  No antialiasing: a single Gaussian sample at dist.
+        return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv;
+    }
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  See scanline_generalized_gaussian_integral_contrib() for details!
+    //  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Avoid repeated divides:
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 beta_inv = float3(1.0)/beta;
+    const float3 scale = color * beta * 0.5 * alpha_inv /   //  color * beta/(2*alpha) ...
+        gamma_impl(beta_inv, beta);                         //  ... / gamma(1/beta), presumably (gamma_impl is defined elsewhere -- confirm)
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel closer to and farther from the scanline too.
+        const float3 sample_offset = float3(pixel_height/3.0);
+        const float3 dist2 = dist + sample_offset;
+        const float3 dist3 = abs(dist - sample_offset);
+        //  Average three generalized Gaussian samples:
+        const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta));
+        const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta));
+        const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta));
+        return scale/3.0 * (weight1 + weight2 + weight3);
+    }
+    else
+    {   //  No antialiasing: a single generalized Gaussian sample at dist.
+        return scale * exp(-pow(abs(dist*alpha_inv), beta));
+    }
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel, using
+    //              a generalized or pure Gaussian distribution and sampling or
+    //              integrals as desired by user codepath choices.
+    if(beam_generalized_gaussian)
+    {
+        if(beam_antialias_level > 1.5)
+        {   //  Generalized Gaussian, integrated over the pixel:
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {   //  Generalized Gaussian, point-sampled (1 or 3 samples, decided by the callee):
+            return scanline_generalized_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+    }
+    else
+    {
+        if(beam_antialias_level > 1.5)
+        {   //  Pure Gaussian, integrated over the pixel (shape_range is unused):
+            return scanline_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+        else
+        {   //  Pure Gaussian, point-sampled (1 or 3 samples, decided by the callee):
+            return scanline_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    const float4x3 colors = float4x3(color0, color1, color2, color3);
+    return max(mul(weights, colors), 0.0);  //  No negative colors (artifacts)
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  The weight is static or dynamic branches are allowed, so branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are colors in gamma-encoded RGB.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(1.0/intermediate_gamma)),
+                pow(color1, float3(1.0/intermediate_gamma)),
+                pow(color2, float3(1.0/intermediate_gamma)),
+                pow(color3, float3(1.0/intermediate_gamma)),
+                weights);
+            //  NOTE(review): gamma_mixed_color is never re-linearized with
+            //  pow(..., intermediate_gamma) before blending -- confirm intent.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (the 2nd texel).
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  the four texels in left-to-right order, where the 2nd
+    //                  texel is just left of the output pixel.
+    //  Returns:    Return a horizontally interpolated texture lookup using 2-4
+    //              nearby texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  Only fetch the two outer texels when beam_horiz_filter > 0.5:
+    float3 outer_left = float3(0.0);
+    float3 outer_right = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        outer_left = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        outer_right = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    //  get_interpolated_linear_color() handles linear vs. gamma-encoded input:
+    const float3 inner_left = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 inner_right = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    return get_interpolated_linear_color(
+        outer_left, inner_left, inner_right, outer_right, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Requires:   tex_uv * tex_size must give texel coords, and multiplying
+    //              by texture_size_inv must map them back to uv coords
+    //              (presumably texture_size_inv == 1.0/tex_size; confirm).
+    //  Returns:    Return a horizontally filtered color lookup around tex_uv,
+    //              with filter weights normalized to sum to 1.0.
+    //  Snap horizontally to the center of the previous texel.  Use under_half
+    //  to fix a rounding bug right around exact texel locations.
+    const float2 texel_pos = tex_uv * tex_size;
+    const float snapped_x = floor(texel_pos.x - under_half) + 0.5;
+    const float2 snapped_uv = float2(snapped_x, texel_pos.y) * texture_size_inv;
+    //  Get horizontal sample distances from the 4 nearby texels:
+    const float offset_x = texel_pos.x - snapped_x;
+    const float4 texel_dists = float4(1.0 + offset_x, offset_x,
+        1.0 - offset_x, 2.0 - offset_x);
+    //  Get Quilez, Gaussian, or Lanczos2 resize weights for 2/4 nearby texels:
+    float4 raw_weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez (only the two inner texels get nonzero weights):
+        const float t = texel_dists.y;
+        const float right_weight = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
+        raw_weights = float4(0.0, 1.0 - right_weight, right_weight, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian with standard deviation beam_horiz_sigma:
+        float inv_two_variance = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        raw_weights = exp(-(texel_dists*texel_dists)*inv_two_variance);
+    }
+    else
+    {
+        //  Lanczos2:
+        const float4 pi_x = FIX_ZERO(texel_dists * pi);
+        raw_weights = 2.0 * sin(pi_x) * sin(pi_x * 0.5) / (pi_x * pi_x);
+    }
+    //  Normalize the weight sum to 1.0 and blend the scanline's texels:
+    const float4 unit_weights = raw_weights/dot(raw_weights, float4(1.0));
+    return get_scanline_color(tex, snapped_uv,
+        float2(texture_size_inv.x, 0.0), unit_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Requires:   Requirements of sample_single_scanline_horizontal() must
+    //              be met.
+    //  Returns:    Return a horizontally filtered scanline color.  If
+    //              beam_misconvergence is set, R, G, and B each come from a
+    //              separate lookup at tex_uv minus that channel's horizontal
+    //              convergence offset (scaled by texture_size_inv.x).
+    if(beam_misconvergence)
+    {
+        //  Rely on a helper to make convergence easier, and scale its
+        //  per-channel offsets into uv units:
+        const float3 shift_u_rgb =
+            get_convergence_offsets_x_vector() * texture_size_inv.xxx;
+        //  Keep only the matching channel from each shifted lookup:
+        const float red = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(shift_u_rgb.r, 0.0), tex_size, texture_size_inv).r;
+        const float green = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(shift_u_rgb.g, 0.0), tex_size, texture_size_inv).g;
+        const float blue = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(shift_u_rgb.b, 0.0), tex_size, texture_size_inv).b;
+        return float3(red, green, blue);
+    }
+    //  Otherwise all three channels share a single lookup:
+    return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+        texture_size_inv);
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only consider even/odd scanlines every
+    //  other frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    //  The floor() term zeroes the offset when il_step_multiple.y < 4/3
+    //  (presumably progressive); else it follows frame parity and BFF order.
+    const float field_offset = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 curr_texel = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel_num = floor(curr_texel - float2(under_half));
+    const float wrong_field = fmod(
+        prev_texel_num.y + field_offset, il_step_multiple.y);
+    const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 scanline_texel = scanline_texel_num + float2(0.5);
+    const float2 scanline_uv = scanline_texel * texture_size_inv;
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
+    return scanline_uv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Detect interlacing based on the number of lines in the source.
+    //  Always report false when interlace_detect is off.
+    //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+    //  NTSC Emulators: Typically 224 or 240 lines
+    //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+    //  PAL Emulators: ?
+    //  ATSC: 720p, 1080i, 1080p
+    //  Where do we place our cutoffs?  Assumptions:
+    //  1.) We only need to care about active lines.
+    //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+    //  3.) Anything > 576 lines is probably not interlaced...
+    //  4.) ...except 1080 lines, which is a crapshoot (user decision).
+    //  5.) Just in case the main program uses calculated video sizes,
+    //      we should nudge the float thresholds a bit.
+    bool line_count_says_interlaced = false;
+    if(interlace_detect)
+    {
+        //  The cutoffs are nudged by half a line (see assumption 5):
+        const bool in_sd_range = (num_lines > 288.5) && (num_lines < 576.5);
+        const bool in_hd_range = (num_lines > 1079.5) && (num_lines < 1080.5);
+        //  The 1080-line case only counts if interlace_1080i is set:
+        line_count_says_interlaced =
+            in_sd_range || (bool(interlace_1080i) && in_hd_range);
+    }
+    return line_count_says_interlaced;
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+void main() {
+    //  Linearize the input based on CRT gamma and bob interlaced fields.
+    //  Bobbing ensures we can immediately blur without getting artifacts.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    const float2 tex_uv = vTexCoord.xy;
+    if(!bool(interlace_detect))
+    {
+        //  Nothing to bob; tex2D_linearize will decode CRT gamma:
+        FragColor = encode_output(tex2D_linearize(input_texture, tex_uv));
+        return;
+    }
+    //  Sample the current line and an average of the previous/next line;
+    //  tex2D_linearize will decode CRT gamma.  Don't bother branching:
+    const float2 line_step = float2(0.0, uv_step.y);
+    const float3 prev_line =
+        tex2D_linearize(input_texture, tex_uv - line_step).rgb;
+    const float3 this_line =
+        tex2D_linearize(input_texture, tex_uv).rgb;
+    const float3 following_line =
+        tex2D_linearize(input_texture, tex_uv + line_step).rgb;
+    const float3 neighbor_average = 0.5 * (prev_line + following_line);
+    //  If we're interlacing, determine which field the current line is in:
+    const float field_count = interlaced + 1.0;
+    const float field_shift =
+        fmod(frame_count + interlace_bff, field_count);
+    //  Use under_half to fix a rounding bug around exact texel locations.
+    const float prev_line_num =
+        floor(tex_uv.y * texture_size.y - under_half);
+    const float in_wrong_field = fmod(prev_line_num + field_shift, field_count);
+    //  Select the correct color, and output the result
+    //  (in_wrong_field is the blend weight toward the neighbor average):
+    const float3 bobbed = lerp(this_line, neighbor_average, in_wrong_field);
+    FragColor = encode_output(float4(bobbed, 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.vs b/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.vs
new file mode 100644
index 00000000..12b93534
--- /dev/null
+++ b/shaders/CRT-Royale.shader/first-pass-linearize-crt-gamma-bob-fields.vs
@@ -0,0 +1,4704 @@
+#version 150
+
+in vec4 position;
+in vec2 texCoord;
+
+out Vertex {
+   vec2 vTexCoord;      // presumably read as tex_uv by the matching .fs main()
+   vec2 uv_step;        // the matching .fs main() uses .y as its per-line uv step
+   float interlaced;    // the matching .fs main() uses interlaced + 1.0 as a modulus
+};
+
+uniform vec4 targetSize;       // .xy is aliased below as output_size
+uniform vec4 sourceSize[];     // [0].xy is aliased below as texture_size/video_size
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms
+#define mul(a,b) ((b)*(a))  // operands swapped; args parenthesized for safety
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  // NOTE(review): HLSL atan2 and GLSL atan both take (y, x); this swap looks wrong -- confirm callers
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]
+
+#ifdef GL_ES
+#ifdef GL_FRAGMENT_PRECISION_HIGH
+precision highp float;
+#else
+precision mediump float;
+#endif
+#define COMPAT_PRECISION mediump
+#else
+#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+#define COMPAT_VARYING in
+#define COMPAT_TEXTURE texture
+#else
+#define COMPAT_VARYING varying
+#define FragColor gl_FragColor
+#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  PASS SETTINGS:
+//  gamma-management.h needs to know what kind of pipeline we're using and
+//  what pass this is in that pipeline.  This will become obsolete if/when we
+//  can #define things like this in the .cgp preset file.
+#define FIRST_PASS
+#define SIMULATE_CRT_ON_LCD
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "bind-shader-h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default (raise toward 1.0 if too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bounding
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): an older note here said this was raised to 1.0, but the value is 0.5.
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;   //  higher scale; presumably the bloom-skipping version - confirm
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);   //  i.e. 1/16 per axis
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;    //  upper cap applied by get_aspect_vector()
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;     //  ~0.1987
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;           //  ~0.2078
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;               //  ~0.1804
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;             //  ~0.1608
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  15-pixel stripes (the default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16 floor; yields |c| (sign is dropped)
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (any value > 1.5) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  No compatibility mode: use the static setting unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Without RUNTIME_SHADER_PARAMS_ENABLE (static params), #undef the slow runtime
+//  paths.  Most won't be a problem anyway once params are disabled; some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility: requesting a
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  tex2Dlod tiling strategy is not active
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  neither tex2Dlod nor tex2Dbias tiling strategy is active
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  Choose the LUT size recorded in mask_resize_src_lut_size.  The large
+//  mipmapped LUT is selected only when tex2Dlod is allowed by the drivers AND
+//  the tex2Dlod resampling strategy is enabled; in that case
+//  PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT is #defined to flag the choice.  Every
+//  other configuration uses the small LUT.  Rationale: if the mipmapped LUT
+//  causes no mipmapping artifacts, it gives more options for fewer samples.
+#if defined(DRIVERS_ALLOW_TEX2DLOD) && defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD)
+    //  TODO: Take advantage of this!
+    #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  RUNTIME MASK MODE/TYPE SELECTION:
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above, so selecting the
+//  phosphor mask at runtime is hard.  Things that do NOT work (they just give
+//  us the input texture or a black screen):
+//  - Assigning to a uniform global in the vertex shader.
+//  - Selecting a sampler2D in the vertex shader and passing it to the
+//    fragment shader (even with explicit TEXUNIT# bindings).
+//  What does work: calling tex2D three times with different uniform samplers
+//  (or resizing the phosphor mask three times altogether).
+//  - With dynamic branches, we can process only one of these branches on top
+//    of quickly discarding fragments we don't need (cgc seems able to overcome
+//    limitations around dependent texture fetches inside of branches).
+//  - Without dynamic branches, we have to process every branch for every
+//    fragment, which is slower.  Runtime sampling mode selection is slower
+//    without dynamic branches as well.
+//  So enable runtime selection whenever dynamic branches are allowed, and
+//  otherwise only if the user's static #defines explicitly force it:
+#if defined(DRIVERS_ALLOW_DYNAMIC_BRANCHES) || defined(FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT)
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;    //  NOTE(review): signed .x only; a negative offset shrinks this - confirm
+#else   //  No same-pass geometry: antialiasing needs no extra border here.
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  tex2Dlod/tex2Dbias strategy active: no extra 0.5 is added
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, size the MASK_RESIZE output: choose how many resized tiles to
+//  render into it and the texel (inside the borders) where sampling starts.
+//  Two layouts are possible:
+//  a.) "Tile flat twice": Render two tiles without borders.  Anisotropic
+//      filtering doesn't seem to be a problem here.  This layout is used
+//      only when GEOMETRY_EARLY is NOT #defined and the strategy is still
+//      #defined at this point.
+//  b.) Otherwise: Render one tile plus twice the worst-case tile border,
+//      and start sampling just past the texel border.
+#if !defined(GEOMETRY_EARLY) && defined(ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE)
+    static const float mask_resize_num_tiles = 1.0 + 1.0;
+    static const float mask_start_texels = 0.0;
+#else
+    static const float mask_resize_num_tiles = 1.0 +
+        2.0 * max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  The mask_resize_num_tiles tiles above have to fit into an FBO whose
+//  viewport scale is mask_resize_viewport_scale, which limits the maximum
+//  final triad size.  Estimate the minimum number of triads the screen can be
+//  split into in each dimension (the estimate is only as correct as
+//  mask_resize_viewport_scale is):
+static const float mask_resize_num_triads = mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;    //  i.e. 0.5 - 0.0005
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant here, but OVERRIDE_DEVICE_GAMMA requires it to be defined.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio) {
+    //  Returns a unit-length (aspect, 1.0) vector for the given aspect ratio.
+    //  The ratio is first capped at geom_max_aspect_ratio.  Normalizing the
+    //  result prevents the absolute scale from affecting the uv-mapping for
+    //  curvature, so only the vector's direction encodes the aspect ratio.
+    const float capped_ratio =
+        min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const float2 unscaled_aspect = float2(capped_ratio, 1.0);
+    return normalize(unscaled_aspect);
+}
+
+inline float2 get_geom_overscan_vector() {
+    //  Packs geom_overscan_x and geom_overscan_y into one (x, y) vector.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector() {
+    //  Packs geom_tilt_angle_x and geom_tilt_angle_y into one (x, y) vector.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector() {
+    //  Packs the per-channel x convergence offsets in (r, g, b) order.
+    return float3(
+        convergence_offset_x_r, convergence_offset_x_g, convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector() {
+    //  Packs the per-channel y convergence offsets in (r, g, b) order.
+    return float3(
+        convergence_offset_y_r, convergence_offset_y_g, convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector() {
+    //  Packs the red channel's x and y convergence offsets as (x, y).
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector() {
+    //  Packs the green channel's x and y convergence offsets as (x, y).
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector() {
+    //  Packs the blue channel's x and y convergence offsets as (x, y).
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset() {
+    //  Returns the red subpixel offset used for antialiasing.  The runtime
+    //  x/y parameters are used only when BOTH RUNTIME_ANTIALIAS_WEIGHTS and
+    //  RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS are #defined; any other combination
+    //  yields the compile-time aa_subpixel_r_offset_static constant.
+    #if defined(RUNTIME_ANTIALIAS_WEIGHTS) && defined(RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS)
+        //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+        const float2 runtime_r_offset = float2(aa_subpixel_r_offset_x_runtime,
+            aa_subpixel_r_offset_y_runtime);
+        return runtime_r_offset;
+    #else
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify() {
+    //  Returns the reciprocal of the selected mask's average color.
+    //  mask_type selects: < 0.5 grille, < 1.5 slot, otherwise shadow.
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    const float slot_or_shadow_gain = mask_type < 1.5 ? slot_gain : shadow_gain;
+    return mask_type < 0.5 ? grille_gain : slot_or_shadow_gain;
+}
+
+inline float get_mask_sample_mode() {
+    //  Step 1: Pick the requested mode.  The runtime value is used when
+    //  RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT is #defined, else the static one.
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        const float requested_mode = mask_sample_mode_desired;
+    #else
+        const float requested_mode = mask_sample_mode_static;
+    #endif
+    //  Step 2: Without PHOSPHOR_MASK_MANUALLY_RESIZE, clamp the request into
+    //  [1.0, 2.0]; with it, pass the request through unchanged.
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+//  (When OVERRIDE_STANDARD_GAMMA is defined, none of the constants in this
+//  block are declared here, so the including file must provide all of them.)
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+//  When assume_opaque_alpha is true, encode_output(), decode_input(), and
+//  decode_gamma_input() write 1.0 into the alpha channel of any color they
+//  gamma-adjust; when false they pass the incoming alpha through unchanged.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma accessors.  Built-in defaults are used unless the including
+//  file takes over by defining OVERRIDE_DEVICE_GAMMA:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    //  Defaults derived from the standard gamma constants:
+    inline float get_crt_gamma()
+    {
+        return crt_reference_gamma_high;
+    }
+    inline float get_gba_gamma()
+    {
+        //  Game Boy Advance; in (3.0, 4.0)
+        return 3.5;
+    }
+    inline float get_lcd_gamma()
+    {
+        return lcd_office_gamma;
+    }
+#else
+    //  The including file has promised to define crt_gamma, gba_gamma, and
+    //  lcd_gamma globally; simply forward them:
+    inline float get_crt_gamma()
+    {
+        return crt_gamma;
+    }
+    inline float get_gba_gamma()
+    {
+        return gba_gamma;
+    }
+    inline float get_lcd_gamma()
+    {
+        return lcd_gamma;
+    }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The including file has promised to define input_gamma,
+    //  intermediate_gamma, and output_gamma globally; simply forward them:
+    inline float get_input_gamma()
+    {
+        return input_gamma;
+    }
+    inline float get_intermediate_gamma()
+    {
+        return intermediate_gamma;
+    }
+    inline float get_output_gamma()
+    {
+        return output_gamma;
+    }
+#else
+    //  When every pass is gamma-encoded, ntsc_gamma is always used between
+    //  passes so middle passes never need to know what is being simulated:
+    inline float get_intermediate_gamma()
+    {
+        return ntsc_gamma;
+    }
+    //  The first SIMULATE_* macro defined, in the order tested below, selects
+    //  the input/output gamma pair; with none defined, both are ntsc_gamma.
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma()
+        {
+            return get_crt_gamma();
+        }
+        inline float get_output_gamma()
+        {
+            return get_lcd_gamma();
+        }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma()
+        {
+            return get_gba_gamma();
+        }
+        inline float get_output_gamma()
+        {
+            return get_lcd_gamma();
+        }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma()
+        {
+            return get_lcd_gamma();
+        }
+        inline float get_output_gamma()
+        {
+            return get_crt_gamma();
+        }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma()
+        {
+            return get_gba_gamma();
+        }
+        inline float get_output_gamma()
+        {
+            return get_crt_gamma();
+        }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()
+        {
+            return ntsc_gamma;
+        }
+        inline float get_output_gamma()
+        {
+            return ntsc_gamma;
+        }
+    #endif  //  SIMULATE_*
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Per-pass decoding/encoding setup.  linearize_input and gamma_encode_output
+//  are plain static constants (they aren't derived from overridable values),
+//  which lets the compiler strip the branches that test them.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    //  Every pass decodes what it reads and re-encodes what it writes; only
+    //  the gamma value depends on whether this is the first/last pass.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()
+        {
+            return get_input_gamma();
+        }
+    #else
+        inline float get_pass_input_gamma()
+        {
+            return get_intermediate_gamma();
+        }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()
+        {
+            return get_output_gamma();
+        }
+    #else
+        inline float get_pass_output_gamma()
+        {
+            return get_intermediate_gamma();
+        }
+    #endif
+#else
+    //  Only the first pass decodes and only the last pass encodes; every
+    //  other pass reports a gamma of 1.0 and leaves colors untouched.
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()
+        {
+            return get_input_gamma();
+        }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()
+        {
+            return 1.0;
+        }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()
+        {
+            return get_output_gamma();
+        }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()
+        {
+            return 1.0;
+        }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Lets callers branch statically on whether bilinear filtering is
+//  gamma-correct, i.e. whether this pass reads its input without a manual
+//  decode step:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+//  Gamma-encode a color for this pass's output.
+//  color:   the color to write out.
+//  Returns: color unchanged when gamma_encode_output is false; otherwise rgb
+//           raised to 1.0/get_pass_output_gamma(), with alpha forced to 1.0
+//           if assume_opaque_alpha is set and passed through if not.
+inline float4 encode_output(const float4 color)
+{
+    //  Passes that don't encode hand the color back untouched:
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+//  Linearize a color read from this pass's input.
+//  color:   the sampled color.
+//  Returns: color unchanged when linearize_input is false; otherwise rgb
+//           raised to get_pass_input_gamma(), with alpha forced to 1.0 if
+//           assume_opaque_alpha is set and passed through if not.
+inline float4 decode_input(const float4 color)
+{
+    //  Passes that don't linearize hand the sample back untouched:
+    if(!linearize_input)
+    {
+        return color;
+    }
+    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//  Linearize a color with a caller-supplied per-channel gamma.  Unlike
+//  decode_input(), this does not consult linearize_input: the decode is
+//  always applied.
+//  color:   the sampled color.
+//  gamma:   exponent applied to each of the r, g, b channels.
+//  Returns: rgb raised to gamma, with alpha forced to 1.0 if
+//           assume_opaque_alpha is set and passed through if not.
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    float3 linear_rgb = pow(color.rgb, gamma);
+    float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D:
+//  Each overload samples tex and passes the result through decode_input().
+//  The float3 overloads use only the .xy of tex_coords.  The coordinate
+//  parameters are deliberately left non-const (see the note above the
+//  wrappers about 'const' coords causing offset blurs).
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{
+    float4 encoded_sample = COMPAT_TEXTURE(tex, tex_coords);
+    return decode_input(encoded_sample);
+}
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{
+    float4 encoded_sample = COMPAT_TEXTURE(tex, tex_coords.xy);
+    return decode_input(encoded_sample);
+}
+
+//  NOTE(review): in the two overloads below, texel_off is forwarded as the
+//  third argument of textureLod, which is the LOD rather than a texel offset
+//  -- confirm this is the intended behavior.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{
+    float4 encoded_sample = textureLod(tex, tex_coords, texel_off);
+    return decode_input(encoded_sample);
+}
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{
+    float4 encoded_sample = textureLod(tex, tex_coords.xy, texel_off);
+    return decode_input(encoded_sample);
+}
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod:
+//  Both overloads sample with textureLod using only tex_coords.xy (the .z/.w
+//  components are ignored) and pass the result through decode_input().
+//  The first overload always requests LOD 0.0.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{
+    float4 encoded_sample = textureLod(tex, tex_coords.xy, 0.0);
+    return decode_input(encoded_sample);
+}
+
+//  NOTE(review): texel_off is forwarded as textureLod's LOD argument, not as
+//  a texel offset -- confirm this is the intended behavior.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{
+    float4 encoded_sample = textureLod(tex, tex_coords.xy, texel_off);
+    return decode_input(encoded_sample);
+}
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
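+/////////*
+//  NOTE(review): the line above does not terminate the block comment opened
+//  before "tex3D:".  The comment appears to run on until the closing
+//  delimiter that follows the tex2Dfetch_linearize_gamma wrappers, which
+//  leaves the tex2D_linearize_gamma overloads below disabled.  Confirm that
+//  this is intended before relying on them.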
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod:
+//  Both overloads sample with textureLod using only tex_coords.xy (the .z/.w
+//  components are ignored) and decode the result with the caller-supplied
+//  gamma via decode_gamma_input().  The first overload always requests
+//  LOD 0.0.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{
+    float4 encoded_sample = textureLod(tex, tex_coords.xy, 0.0);
+    return decode_gamma_input(encoded_sample, gamma);
+}
+
+//  NOTE(review): texel_off is forwarded as textureLod's LOD argument, not as
+//  a texel offset -- confirm this is the intended behavior.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{
+    float4 encoded_sample = textureLod(tex, tex_coords.xy, texel_off);
+    return decode_gamma_input(encoded_sample, gamma);
+}
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+//  DRIVERS_ALLOW_FINE_DERIVATIVES is therefore only defined when
+//  DRIVERS_ALLOW_DERIVATIVES is defined; to keep plain derivatives but opt out
+//  of fine derivatives, comment out the #define inside the guard below.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Each #define in this group switches one setting from a compile-time
+//  constant to a runtime option; comment a line out to disable that option.
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//  (Left disabled here.)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+//  (Only the LARGER_THAN_3_PIXELS option is enabled here; the larger sizes
+//  are left commented out.)
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try 1.0 if this looks too dark; higher values clip more)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try 1.0 if this looks too dark; higher values clip more)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  These can't be set
+//  directly in the .cgp file, so they have to be mirrored here by hand.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;             //  with the viewport-scale bloom
+static const float bloom_approx_size_x_for_fake = 400.0;    //  presumably for the version that skips it
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);    //  1/16 per axis
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero.  A macro works for float, float2, etc.; abs() drops the sign:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Always #define this so the first pass decodes CRT gamma and the last encodes LCD gamma:
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (any value > 1.5) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Without RUNTIME_SHADER_PARAMS_ENABLE, only static parameters are used, so
+//  #undef every RUNTIME_*/FORCE_RUNTIME_* option below to disable slow paths.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Whenever a tex2Dlod strategy is requested, also request its tex2Dbias backup:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out strategies whose DRIVERS_ALLOW_* capability is not #defined:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Keep only the highest-priority tiling strategy still #defined, in the order
+//  tex2Dlod, tex2Dbias, tile-flat-twice, fix-discontinuities; #undef the rest.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, so OR them into
+//  one *_TEX2DLOD_FAMILY flag to reduce #ifdef nesting in the next section:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize resampling strategies the same way (tex2Dlod wins over tex2Dbias):
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It allows fewer samples.  Every other case uses the small LUT.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        //  Pad by the offset's magnitude: the red x offset may be negative.
+        static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x);
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border (no tex2Dlod/bias family):
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border, assuming the smallest allowed tile:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE (one tile
+//  plus a border on each side, in general) and the first texel inside the border.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (only as accurate as mask_resize_viewport_scale itself is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;     //  truncated to 12 decimal places
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-t**2), t = 0..x)
+    //              This approximation has a max absolute error of 2.5*10**-5
+    //              with solid numerical robustness and efficiency.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    //  Note:       Computed on abs(x); sign(x) restores the sign afterward.
+    static const float4 one = float4(1.0);
+    const float4 t = one/(one + 0.47047*abs(x));
+    const float4 magnitude =
+        one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*exp(-(x*x));
+    return sign(x) * magnitude;
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload of the Abramowitz/Stegun erf() approximation.
+    //  Computed on abs(x); sign(x) restores the sign afterward:
+    static const float3 one = float3(1.0);
+    const float3 t = one/(one + 0.47047*abs(x));
+    const float3 magnitude =
+        one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*exp(-(x*x));
+    return sign(x) * magnitude;
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload of the Abramowitz/Stegun erf() approximation.
+    //  Computed on abs(x); sign(x) restores the sign afterward:
+    static const float2 one = float2(1.0);
+    const float2 t = one/(one + 0.47047*abs(x));
+    const float2 magnitude =
+        one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*exp(-(x*x));
+    return sign(x) * magnitude;
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload; same formula as the float4 version.
+    const float t = 1.0/(1.0 + 0.47047*abs(x));
+    const float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float gauss = exp(-(x*x));
+    const float magnitude = 1.0 - poly*gauss;
+    return magnitude * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    tanh(1.202760580 * x) as a cheap stand-in for erf(x).  The
+    //              error is visually noticeable, but it's blazing fast and
+    //              perceptually close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Needs a correct driver tanh(): My nVidia 8800GTS returns garbage.
+    const float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    const float3 scaled_x = 1.202760580 * x;  //  float3 overload
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    const float2 scaled_x = 1.202760580 * x;  //  float2 overload
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    const float scaled_x = 1.202760580 * x;  //  Scalar overload
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is #defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  float3 overload: erf6(x) unless ERF_FAST_APPROXIMATION selects erft(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  float2 overload: erf6(x) unless ERF_FAST_APPROXIMATION selects erft(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload: erf6(x) unless ERF_FAST_APPROXIMATION selects erft(x).
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller precomputes this value so it
+    //                  has the opportunity to reuse it.
+    //  Returns:    Approximate gamma function (real-numbered factorial) output
+    //              from a two-coefficient Lanczos approximation, with the
+    //              coefficients calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  We could use three coeffs (0.0000346 error)
+    //              without hurting latency, but this allows more parallelism
+    //              with outside instructions.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler = float4(2.71828182845904523536028747135266249775724709);
+    const float4 s_plus_half = s + float4(0.5);
+    const float4 series = coeff0 + coeff1/(s + float4(1.0));
+    const float4 pow_base = (s_plus_half + lanczos_g)/euler;
+    //  gamma(s + 1) = pow_base**s_plus_half * series.  Dividing by s at the end
+    //  has less error for small s's than (s -= 1.0) at the beginning.
+    const float4 gamma_s_plus_one = pow(pow_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  float3 overload; same Lanczos form as the float4 version.
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler = float3(2.71828182845904523536028747135266249775724709);
+    const float3 s_plus_half = s + float3(0.5);
+    const float3 series = coeff0 + coeff1/(s + float3(1.0));
+    const float3 pow_base = (s_plus_half + lanczos_g)/euler;
+    return pow(pow_base, s_plus_half) * series * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  float2 overload; same Lanczos form as the float4 version.
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler = float2(2.71828182845904523536028747135266249775724709);
+    const float2 s_plus_half = s + float2(0.5);
+    const float2 series = coeff0 + coeff1/(s + float2(1.0));
+    const float2 pow_base = (s_plus_half + lanczos_g)/euler;
+    return pow(pow_base, s_plus_half) * series * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload; same Lanczos form as the float4 version.
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler = 2.71828182845904523536028747135266249775724709;
+    const float s_plus_half = s + 0.5;
+    const float series = coeff0 + coeff1/(s + 1.0);
+    const float pow_base = (s_plus_half + lanczos_g)/euler;
+    return pow(pow_base, s_plus_half) * series * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard gamma function parameter, in [0, 36].
+    //  Returns:    Approximate gamma function output with a maximum relative
+    //              error of 0.000463.  See gamma_impl for details.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    const float3 s_inv = float3(1.0)/s;  //  float3 overload
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    const float2 s_inv = float2(1.0)/s;  //  float2 overload
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    const float s_inv = 1.0/s;  //  Scalar overload
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The actual "rolled up" summation looks like:
+    //      sign = 1.0; z_pow = 1.0; factorial = 1.0;
+    //      sum = sign * z_pow / ((s + 0) * factorial);    //  k = 0 term
+    //      for(int k = 1; k < 4; ++k)
+    //      {
+    //          sign *= -1.0; z_pow *= z; factorial *= k;
+    //          sum += sign * z_pow / ((s + k) * factorial);
+    //      }
+    //  Unrolled, constant-unfolded and arranged for madds and parallelism:
+    const float4 z_pow_s = pow(z, s);
+    const float4 z_sq = z*z;
+    //  Denominators k! * (s + k) for k = 1, 2, 3.  (For k = 0 the denominator
+    //  is just s, so that term is s_inv.)
+    const float4 denom1 = s + float4(1.0);
+    const float4 denom2 = 2.0*s + float4(4.0);
+    const float4 denom3 = 6.0*s + float4(18.0);
+    //float4 denom4 = 24.0*s + float4(96.0);
+    const float4 term1 = z/denom1;
+    const float4 term2 = z_sq/denom2;
+    const float4 term3 = z * z_sq/denom3;
+    //float4 term4 = z_sq * z_sq / denom4;
+    //  Apply the alternating signs in summation order, then scale and return:
+    return z_pow_s * (((s_inv - term1) + term2) - term3);
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  float3 overload; same 4-term series as the float4 version.
+    const float3 z_pow_s = pow(z, s);
+    const float3 z_sq = z*z;
+    const float3 denom1 = s + float3(1.0);
+    const float3 denom2 = 2.0*s + float3(4.0);
+    const float3 denom3 = 6.0*s + float3(18.0);
+    const float3 term1 = z/denom1;
+    const float3 term2 = z_sq/denom2;
+    const float3 term3 = z * z_sq/denom3;
+    const float3 series = ((s_inv - term1) + term2) - term3;
+    return z_pow_s * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  float2 overload; same 4-term series as the float4 version.
+    const float2 z_pow_s = pow(z, s);
+    const float2 z_sq = z*z;
+    const float2 denom1 = s + float2(1.0);
+    const float2 denom2 = 2.0*s + float2(4.0);
+    const float2 denom3 = 6.0*s + float2(18.0);
+    const float2 term1 = z/denom1;
+    const float2 term2 = z_sq/denom2;
+    const float2 term3 = z * z_sq/denom3;
+    const float2 series = ((s_inv - term1) + term2) - term3;
+    return z_pow_s * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar overload; same 4-term series as the float4 version.
+    const float z_pow_s = pow(z, s);
+    const float z_sq = z*z;
+    const float denom1 = s + 1.0;
+    const float denom2 = 2.0*s + 4.0;
+    const float denom3 = 6.0*s + 18.0;
+    const float term1 = z/denom1;
+    const float term2 = z_sq/denom2;
+    const float term3 = z * z_sq/denom3;
+    const float series = ((s_inv - term1) + term2) - term3;
+    return z_pow_s * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+    //  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up:"
+    //      denom = float4('inf');
+    //      float4 one = float4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+    //      }
+    //  Unrolled (one named level per i) and constant-unfolded for madds:
+    const float4 scale = pow(z, s) * exp(-z);
+    const float4 level4 = float4(7.0) + z - s;
+    const float4 level3 = float4(5.0) + z - s + (3.0*s - float4(9.0))/level4;
+    const float4 level2 = float4(3.0) + z - s + (2.0*s - float4(4.0))/level3;
+    const float4 level1 = float4(1.0) + z - s + (s - float4(1.0))/level2;
+    return scale / level1;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  float3 overload; same 4-level continued fraction as the float4 version.
+    const float3 scale = pow(z, s) * exp(-z);
+    const float3 level4 = float3(7.0) + z - s;
+    const float3 level3 = float3(5.0) + z - s + (3.0*s - float3(9.0))/level4;
+    const float3 level2 = float3(3.0) + z - s + (2.0*s - float3(4.0))/level3;
+    const float3 level1 = float3(1.0) + z - s + (s - float3(1.0))/level2;
+    return scale / level1;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  float2 overload; same 4-level continued fraction as the float4 version.
+    const float2 scale = pow(z, s) * exp(-z);
+    const float2 level4 = float2(7.0) + z - s;
+    const float2 level3 = float2(5.0) + z - s + (3.0*s - float2(9.0))/level4;
+    const float2 level2 = float2(3.0) + z - s + (2.0*s - float2(4.0))/level3;
+    const float2 level1 = float2(1.0) + z - s + (s - float2(1.0))/level2;
+    return scale / level1;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload; same 4-level continued fraction as the float4 version.
+    const float scale = pow(z, s) * exp(-z);
+    const float level4 = 7.0 + z - s;
+    const float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
+    const float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
+    const float level1 = 1.0 + z - s + (s - 1.0)/level2;
+    return scale / level1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  Since we only care about s < 0.5, we only need
+    //              two branches (not four) based on z.  Each branch uses four
+    //              terms, with a max relative error of ~0.00182.  The branch
+    //              threshold and specifics were adapted for fewer terms from
+    //              Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Evaluate both branches and mask-blend them: Real branches test slower.
+    static const float4 z_cutoff = float4(0.775075);
+    const float4 cf_result = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float4 series_result = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool4 use_cf;
+    use_cf.x = z.x > z_cutoff.x;
+    use_cf.y = z.y > z_cutoff.y;
+    use_cf.z = z.z > z_cutoff.z;
+    use_cf.w = z.w > z_cutoff.w;
+    const float4 cf_weight = float4(use_cf);
+    const float4 series_weight = float4(not(use_cf));
+    return cf_result * cf_weight + series_result * series_weight;
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    static const float3 z_cutoff = float3(0.775075);  //  float3 overload
+    const float3 cf_result = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float3 series_result = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool3 use_cf;
+    use_cf.x = z.x > z_cutoff.x;
+    use_cf.y = z.y > z_cutoff.y;
+    use_cf.z = z.z > z_cutoff.z;
+    const float3 cf_weight = float3(use_cf);
+    const float3 series_weight = float3(not(use_cf));
+    return cf_result * cf_weight + series_result * series_weight;
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    static const float2 z_cutoff = float2(0.775075);  //  float2 overload
+    const float2 cf_result = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float2 series_result = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    bool2 use_cf;
+    use_cf.x = z.x > z_cutoff.x;
+    use_cf.y = z.y > z_cutoff.y;
+    const float2 cf_weight = float2(use_cf);
+    const float2 series_weight = float2(not(use_cf));
+    return cf_result * cf_weight + series_result * series_weight;
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload; both branches are evaluated and blended by a 0/1 mask.
+    static const float z_cutoff = 0.775075;
+    const float cf_result = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const float series_result = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const bool use_cf = z > z_cutoff;
+    return cf_result * float(use_cf) + series_result * float(!use_cf);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    Approximate normalized lower incomplete gamma function for
+    //              s < 0.5.  See normalized_ligamma_impl() for details.
+    const float4 inv_s = float4(1.0)/s;
+    const float4 inv_gamma_s = float4(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  float3 overload; precomputes 1/s and 1/gamma(s) for the _impl function.
+    const float3 inv_s = float3(1.0)/s;
+    const float3 inv_gamma_s = float3(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  float2 overload; precomputes 1/s and 1/gamma(s) for the _impl function.
+    const float2 inv_s = float2(1.0)/s;
+    const float2 inv_gamma_s = float2(1.0)/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload; precomputes 1/s and 1/gamma(s) for the _impl function.
+    const float inv_s = 1.0/s;
+    const float inv_gamma_s = 1.0/gamma_impl(s, inv_s);
+    return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Default standard gamma constants (skipped under OVERRIDE_STANDARD_GAMMA):
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRTs seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In [2.35, 2.55]
+    static const float crt_reference_gamma_low = 2.35;  //  In [2.35, 2.55]
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  When true, encode_output() and the decode_*input() functions return alpha =
+//  1.0; when false they pass color.a through.  Users may override the default.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma getters; OVERRIDE_DEVICE_GAMMA switches them to user constants:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define crt_gamma, gba_gamma, and lcd_gamma:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define all three of these constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  By default only the
+//  first pass linearizes and only the last pass encodes; GAMMA_ENCODE_EVERY_FBO
+//  makes every pass do both.  The bools are static so dead code is eliminated.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif
+
+//  Lets users check if bilinear filtering is gamma-correct (no manual decode):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Requires:   color is this pass's output color (rgb plus alpha).
+    //  Returns:    color unchanged unless gamma_encode_output is set.  Otherwise
+    //              rgb is raised to 1.0/get_pass_output_gamma(), and alpha is
+    //              1.0 if assume_opaque_alpha is set, else color.a.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    const float3 exponent = float3(1.0/get_pass_output_gamma());
+    const float3 encoded_rgb = pow(color.rgb, exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Requires:   color is a texel read from this pass's input.
+    //  Returns:    color unchanged unless linearize_input is set.  Otherwise
+    //              rgb is raised to get_pass_input_gamma(), and alpha is 1.0
+    //              if assume_opaque_alpha is set, else color.a.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    const float3 exponent = float3(get_pass_input_gamma());
+    const float3 linear_rgb = pow(color.rgb, exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Returns:    rgb raised to the caller-supplied gamma; alpha is 1.0 if
+    //              assume_opaque_alpha is set, else color.a.
+    const float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample tex and run the result through decode_input().
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 overload: only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is passed as textureLod's LOD argument here, not as a texel offset -- confirm intent.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  Same as above for float3 coords (.xy only); texel_off is again the LOD argument.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: samples at a fixed LOD of 0.0; only tex_coords.xy is read (.zw are ignored).
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): texel_off is passed as textureLod's LOD argument here, not as a texel offset -- confirm intent.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
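+//  NOTE: the block comment opened before "tex3D:" is NOT closed here; everything down to the comment terminator after the tex2Dfetch_linearize_gamma wrappers stays commented out.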
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: manual-gamma variant; samples at a fixed LOD of 0.0 using only tex_coords.xy.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): texel_off is passed as textureLod's LOD argument here, not as a texel offset -- confirm intent.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Purpose:    Return the per-channel standard deviation of the Gaussian
+    //              beam used to draw "color":
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Globals:    1.) beam_min_sigma and beam_max_sigma: global floats with
+    //                  the desired minimum/maximum beam standard deviations,
+    //                  for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be a global float.
+    //  Parameters: 1.) color: the underlying source color along a scanline.
+    //              2.) sigma_range: beam_max_sigma - beam_min_sigma.  It is a
+    //                  parameter so callers compute it once, even when
+    //                  beam_{min, max}_sigma are runtime shader parameters.
+    //  Shape f():  Selected by the global beam_spot_shape_function:
+    //              a.) < 0.5 selects a power function:
+    //                      f(color) = pow(color, beam_spot_power)
+    //              b.) otherwise (users may set it to 1) a spherical one:
+    //                      f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //  Details/Discussion:
+    //  f() in [0, 1] loosely corresponds to an aspect-corrected beam spot
+    //  shape (related, though not exactly equal).  With f(color) = color,
+    //  spots look like diamonds; a spherical function or a cube strikes a
+    //  balance between variable width and a soft/realistic shape.  Setting
+    //  beam_spot_power > 1.0 can give an ugly spot shape and more initial
+    //  clipping, though the final shape also depends on the horizontal
+    //  resampling filter and the phosphor bloom: resampling horizontally in
+    //  nonlinear light and/or with a sharp (e.g. Lanczos) filter sharpens
+    //  the spot shape, yet a sixth root remains quite soft.  The power
+    //  function (default beam_spot_power 1.0/3.0) is the most flexible
+    //  choice, while the fixed spherical curve offers the highest
+    //  variability without an awful spot shape.
+    //
+    //  beam_min_sigma controls scanline sharpness/aliasing in dim areas, and
+    //  its distance from beam_max_sigma controls beam width variability.  It
+    //  affects clipping [for pure Gaussians] only if beam_spot_power > 1.0
+    //  (a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma controls clipping and how much scanlines widen/soften
+    //  as color increases.  Wider beams need more scanlines evaluated to
+    //  avoid distortion.  For a pure Gaussian, the max_beam_sigma at which
+    //  the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    float3 spot_shape;
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Power curve: f(color) = color^beam_spot_power
+        spot_shape = pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Spherical curve: f(color) = sqrt(1 - (color - 1)^2)
+        const float3 color_offset = color - float3(1.0);
+        spot_shape = sqrt(float3(1.0) - color_offset*color_offset);
+    }
+    return float3(beam_min_sigma) + sigma_range * spot_shape;
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Purpose:    Return the type-I generalized Gaussian "shape" parameter
+    //              beta for "color":
+    //                  beta = beam_min_shape + shape_range * color^power
+    //  Globals:    1.) beam_min_shape and beam_max_shape: global floats with
+    //                  the desired min/max generalized Gaussian beta values,
+    //                  for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power (the power above): a global float.
+    //  Parameters: 1.) color: the underlying source color along a scanline.
+    //              2.) shape_range: beam_max_shape - beam_min_shape, taken as
+    //                  a parameter so it is not recomputed per call when
+    //                  beam_{min, max}_shape are runtime shader parameters.
+    //  Details/Discussion:
+    //  How beta shapes the scanline distribution:
+    //  a.) beta < 2.0: the peak narrows to a spike with a discontinuous slope
+    //  b.) beta == 2.0: degenerates to a plain Gaussian
+    //  c.) beta > 2.0: the peak flattens and widens, then falls off more
+    //      steeply than a Gaussian.  High sigmas widen and soften peaks, but
+    //      high betas widen and sharpen them at the risk of aliasing.
+    //  High beam_shape_powers soften shape transitions (unlike high
+    //  beam_spot_powers); lower ones sharpen them at the risk of aliasing.
+    const float3 shape_curve = pow(color, float3(beam_shape_power));
+    return beam_min_shape + shape_range * shape_curve;
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Purpose:    Return a scanline's light output averaged over one output
+    //              pixel, integrating a pure Gaussian beam profile.
+    //  Parameters: 1.) dist: distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline, in scanlines (1.0 means the
+    //                  sample point straddles the next scanline).
+    //              2.) color: the underlying source color along a scanline.
+    //              3.) pixel_height: the output pixel height in scanlines.
+    //              4.) sigma_range: see get_gaussian_sigma(), whose
+    //                  requirements must also be met.
+    //  Details:
+    //  A CRT beam profile is roughly Gaussian and wider for bright colors than
+    //  for dark ones.  A Gaussian always integrates to 1.0 over its full
+    //  range, so varying sigma does not change brightness.  With x = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  erf() (the "error function") is evaluated at both pixel edges, and the
+    //  difference gives the definite integral, i.e. the average brightness
+    //  over the pixel.  Even with curved coords, a flat scalar pixel height
+    //  works almost as well as one from a full pixel-to-scanline matrix.
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 erf_scale = 1.0/(beam_sigma*sqrt(2.0));
+    const float3 erf_upper = erf((dist + half_pixel)*erf_scale);
+    const float3 erf_lower = erf((dist - half_pixel)*erf_scale);
+    return color * 0.5*(erf_upper - erf_lower)/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models the standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 s = float3(1.0)/beta;
+    const float3 ph_offset = float3(pixel_height * 0.5);
+    //  Pass beta to gamma_impl to avoid repeated divides.  Similarly, pass
+    //  beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
+    const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta);
+    const float3 dist1 = dist + ph_offset;
+    const float3 dist0 = dist - ph_offset;
+    const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
+        s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
+    const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
+        s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
+    return color * 0.5*(integral_high - integral_low)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  Point-sampled counterpart of scanline_gaussian_integral_contrib()
+    //  (see that function for detailed comments):
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    //  Precompute reciprocals once so the samples below need no divides:
+    const float3 inv_sigma = float3(1.0)/beam_sigma;
+    const float3 exp_coeff = 0.5 * inv_sigma * inv_sigma;
+    const float3 norm_coeff = inv_sigma/sqrt(2.0*pi);
+    if(!(beam_antialias_level > 0.5))
+    {
+        //  No antialiasing: one Gaussian sample at dist.
+        return color*exp(-(dist*dist)*exp_coeff)*norm_coeff;
+    }
+    //  Antialiased: average three pure Gaussian samples, at dist and 1/3
+    //  pixel to either side of it.
+    const float3 third_pixel = float3(pixel_height/3.0);
+    const float3 dist_plus = dist + third_pixel;
+    const float3 dist_minus = abs(dist - third_pixel);
+    //  Fold the 1/3 averaging factor into the shared scale:
+    const float3 sample_scale = color/3.0 * norm_coeff;
+    const float3 weight_center = exp(-(dist*dist)*exp_coeff);
+    const float3 weight_plus = exp(-(dist_plus*dist_plus)*exp_coeff);
+    const float3 weight_minus = exp(-(dist_minus*dist_minus)*exp_coeff);
+    return sample_scale * (weight_center + weight_plus + weight_minus);
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Point-sampled counterpart of
+    //  scanline_generalized_gaussian_integral_contrib() (details there):
+    //  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  Here alpha = sqrt(2)*sigma (gg_alpha) and beta is gg_beta:
+    const float3 gg_alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 gg_beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Precompute reciprocals once so the samples below need no divides:
+    const float3 inv_alpha = float3(1.0)/gg_alpha;
+    const float3 inv_beta = float3(1.0)/gg_beta;
+    const float3 amplitude = color * gg_beta * 0.5 * inv_alpha /
+        gamma_impl(inv_beta, gg_beta);
+    if(!(beam_antialias_level > 0.5))
+    {
+        //  No antialiasing: one generalized Gaussian sample at dist.
+        return amplitude * exp(-pow(abs(dist*inv_alpha), gg_beta));
+    }
+    //  Antialiased: average three generalized Gaussian samples, at dist and
+    //  1/3 pixel to either side of it.
+    const float3 third_pixel = float3(pixel_height/3.0);
+    const float3 dist_plus = dist + third_pixel;
+    const float3 dist_minus = abs(dist - third_pixel);
+    const float3 weight_center = exp(-pow(abs(dist*inv_alpha), gg_beta));
+    const float3 weight_plus = exp(-pow(abs(dist_plus*inv_alpha), gg_beta));
+    const float3 weight_minus = exp(-pow(abs(dist_minus*inv_alpha), gg_beta));
+    return amplitude/3.0 * (weight_center + weight_plus + weight_minus);
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Purpose:    Return a scanline's light output over a given pixel,
+    //              dispatching on the user's codepath choices:
+    //              a.) beam_generalized_gaussian picks a generalized
+    //                  Gaussian distribution over a pure Gaussian one.
+    //              b.) beam_antialias_level > 1.5 picks integrals over
+    //                  point sampling.
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Note:       shape_range is forwarded only on the generalized Gaussian
+    //              paths; the pure Gaussian paths do not use it.
+    if(beam_generalized_gaussian)
+    {
+        if(beam_antialias_level > 1.5)
+        {
+            //  Generalized Gaussian, integrated over the pixel:
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        //  Generalized Gaussian, point-sampled:
+        return scanline_generalized_gaussian_sampled_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+    if(beam_antialias_level > 1.5)
+    {
+        //  Pure Gaussian, integrated over the pixel:
+        return scanline_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range);
+    }
+    //  Pure Gaussian, point-sampled:
+    return scanline_gaussian_sampled_contrib(
+        dist, color, pixel_height, sigma_range);
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    //  Combine color0-3 with weights via mul(); max() with 0.0 avoids bizarre artifacts from negative colors:
+    return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  Branching is allowed here, so skip the unused blend when possible:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are gamma-encoded RGB.  The exponent is a float3 to match pow()'s float3 base.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+            //  FIXME (left by the original author, cause not recorded): a
+            //  disabled override "beam_horiz_linear_rgb_weight1 = 1.0" sat here.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Purpose:    Fetch the texels around an output pixel on one scanline.
+    //  Requires:   1.) scanline_uv is snapped vertically to the caller's
+    //                  desired line or scanline, and horizontally to the
+    //                  texel just left of the output pixel (inner_left).
+    //              2.) uv_step_x is the horizontal uv distance between
+    //                  neighboring texels.
+    //              3.) weights holds the interpolation filter weights for
+    //                  outer_left, inner_left, inner_right, and
+    //                  outer_right, in that order.
+    //  Returns:    The result of get_interpolated_linear_color() for the
+    //              2-4 fetched texels and the given weights.
+    //  The outer taps are only fetched when beam_horiz_filter > 0.5:
+    float3 outer_left = float3(0.0);
+    float3 outer_right = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        outer_left = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        outer_right = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    const float3 inner_left = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 inner_right = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    //  Pass the texels on as sampled, whether linear or gamma-encoded:
+    return get_interpolated_linear_color(
+        outer_left, inner_left, inner_right, outer_right, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Requires:   tex_size and texture_size_inv describe tex (texel counts
+    //              and their reciprocals) - TODO: confirm against callers.
+    //  Returns:    The color at tex_uv, filtered horizontally (Quilez,
+    //              Gaussian, or Lanczos2, selected by beam_horiz_filter).
+    const float2 texel_pos = tex_uv * tex_size;
+    //  Snap x to a texel center at/left of the sample; under_half fixes a
+    //  rounding bug right around exact texel locations.
+    const float2 left_center =
+        floor(texel_pos - float2(under_half)) + float2(0.5);
+    const float2 left_texel = float2(left_center.x, texel_pos.y);
+    const float frac_x = texel_pos.x - left_texel.x;
+    //  Horizontal distances from the sample to each of the 4 nearby texels:
+    const float4 tap_dists =
+        float4(1.0 + frac_x, frac_x, 1.0 - frac_x, 2.0 - frac_x);
+    //  Raw (unnormalized) filter weights for the 2/4 nearby texels:
+    float4 raw_weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez (only the inner two texels get nonzero weights):
+        const float t = tap_dists.y;
+        const float right_weight = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
+        raw_weights = float4(0.0, 1.0 - right_weight, right_weight, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian with standard deviation beam_horiz_sigma:
+        float inv_two_var = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        raw_weights = exp(-(tap_dists*tap_dists)*inv_two_var);
+    }
+    else
+    {
+        //  Lanczos2:
+        const float4 pi_x = FIX_ZERO(tap_dists * pi);
+        raw_weights = 2.0 * sin(pi_x) * sin(pi_x * 0.5) / (pi_x * pi_x);
+    }
+    //  Normalize so the weights sum to 1.0, then blend the texels:
+    const float4 unit_weights = raw_weights/dot(raw_weights, float4(1.0));
+    return get_scanline_color(tex, left_texel * texture_size_inv,
+        float2(texture_size_inv.x, 0.0), unit_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Returns:    The horizontally filtered scanline color at tex_uv.  With
+    //              beam_misconvergence, R/G/B are each sampled at their own
+    //              horizontally offset coordinate.
+    if(!beam_misconvergence)
+    {
+        //  All three channels share one sample location:
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+    //  Scale the per-channel x offsets by the inverse texture width and
+    //  subtract each one from the sample location of its own channel:
+    const float3 shift_u =
+        get_convergence_offsets_x_vector() * texture_size_inv.xxx;
+    const float2 uv_for_r = tex_uv - float2(shift_u.r, 0.0);
+    const float2 uv_for_g = tex_uv - float2(shift_u.g, 0.0);
+    const float2 uv_for_b = tex_uv - float2(shift_u.b, 0.0);
+    //  Take a full RGB sample per channel, keeping one component of each:
+    return float3(
+        sample_single_scanline_horizontal(
+            tex, uv_for_r, tex_size, texture_size_inv).r,
+        sample_single_scanline_horizontal(
+            tex, uv_for_g, tex_size, texture_size_inv).g,
+        sample_single_scanline_horizontal(
+            tex, uv_for_b, tex_size, texture_size_inv).b);
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Returns the uv coords of the last/upper scanline's center for tex_uv,
+    //  and stores the sample's distance from it (in scanlines) in dist.
+    //  With interlacing, only even/odd scanlines are considered every other
+    //  frame: Top-field first (TFF) order puts even scanlines on even frames,
+    //  and BFF order puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    //  NOTE(review): the port marked interlace_bff here with an unexplained fixme.
+    const float frame_parity = fmod(frame_count + float(interlace_bff), 2.0);
+    const float field_offset = floor(il_step_multiple.y * 0.75) * frame_parity;
+    const float2 texel_pos = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 texel_index = floor(texel_pos - float2(under_half));
+    //  Lines to step back so the result lands in the current field:
+    const float lines_past_field =
+        fmod(texel_index.y + field_offset, il_step_multiple.y);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 line_center =
+        texel_index - float2(0.0, lines_past_field) + float2(0.5);
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (texel_pos.y - line_center.y)/il_step_multiple.y;
+    return line_center * texture_size_inv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Guess whether the source is interlaced from its line count alone.
+    //  Returns false outright unless interlace_detect is enabled.
+    if(!interlace_detect)
+    {
+        return false;
+    }
+    //  Reference line counts:
+    //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+    //  NTSC Emulators: Typically 224 or 240 lines
+    //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+    //  PAL Emulators: ?
+    //  ATSC: 720p, 1080i, 1080p
+    //  Cutoff assumptions:
+    //  1.) Only active lines matter.
+    //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+    //  3.) Anything > 576 lines is probably not interlaced, except 1080
+    //      lines, which is a crapshoot (left to interlace_1080i).
+    //  4.) The thresholds sit at half-lines in case the main program
+    //      uses calculated video sizes.
+    if((num_lines > 288.5) && (num_lines < 576.5))
+    {
+        return true;
+    }
+    return bool(interlace_1080i) &&
+        ((num_lines > 1079.5) && (num_lines < 1080.5));
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+void main() {
+   //  Interlacing flag for the source: 1.0 = true, 0.0 = false.
+   const float2 source_dims = video_size;
+   interlaced = float(is_interlaced(source_dims.y));
+   //  Pass through position/texcoords; uv_step is 1/texture_size per axis.
+   uv_step = float2(1.0)/texture_size;
+   vTexCoord = texCoord;
+   gl_Position = position;
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/geometry-aa-last-pass.fs b/shaders/CRT-Royale.shader/geometry-aa-last-pass.fs
new file mode 100644
index 00000000..87d1b721
--- /dev/null
+++ b/shaders/CRT-Royale.shader/geometry-aa-last-pass.fs
@@ -0,0 +1,5279 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+uniform sampler2D source[];   // source[0] is aliased as input_texture below
+uniform vec4 sourceSize[];    // sourceSize[0].xy is aliased as texture_size and video_size below
+uniform vec4 targetSize;      // targetSize.xy is aliased as output_size below
+uniform int phase;            // aliased as frame_count below
+// Input interface block received from the previous shader stage:
+in Vertex {
+   vec2 vTexCoord;
+   vec2 tex_uv;
+   vec4 video_and_texture_size_inv;
+   vec2 output_size_inv;
+   vec3 eye_pos_local;
+   vec4 geom_aspect_and_overscan;
+   vec3 global_to_local_row0;
+   vec3 global_to_local_row1;
+   vec3 global_to_local_row2;
+};
+
+out vec4 FragColor;
+
+// USER SETTINGS BLOCK //
+// Fixed values for the tunable parameters; presumably these mirror the *_static defaults documented in the user-settings section below - confirm.
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 1.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros (HLSLisms -> GLSLisms); mul() swaps its operands and parenthesizes each one
+#define mul(a,b) ((b)*(a))
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(y,x) atan(y,x)
+#define rsqrt(c) inversesqrt(c)
+// NOTE(review): GLSL mod() floors whereas HLSL fmod() truncates; results differ for negative x.
+#define input_texture source[0]
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#define LAST_PASS
+#define SIMULATE_CRT_ON_LCD
+//  The header referenced below is inlined after the BEGIN USER-SETTINGS banner:
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; NOTE(review): an older note here said 0.5 was too dark and the value was raised to 1.0, but the code still uses 0.5 - confirm which is intended
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5, which is the value set here (an older note about using 1.0 was stale)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;             //  for the version with the viewport-scale bloom
+static const float bloom_approx_size_x_for_fake = 400.0;    //  presumably for the version that skips it (higher, as noted above)
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  15-pixel grille stripes (the default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero by clamping |c| up to 2^-16.  NOTE: the result is always non-negative (the sign of c is dropped).  Using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Always #define SIMULATE_CRT_ON_LCD (unless it already is) to ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined); the losing strategies are #undef'd under DERIVED SETTINGS.
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (any value > 1.5) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: keep the user's static choice
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used, i.e. if RUNTIME_SHADER_PARAMS_ENABLE is not #defined.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility: requesting a tex2Dlod strategy also requests its tex2Dbias counterpart.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies (those whose DRIVERS_ALLOW_* capability is not #defined):
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance (tex2Dlod, then tex2Dbias, then tile-flat-twice, then fix-discontinuities) and
+//  disable unused strategies, leaving at most one #defined.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them (the prioritization above leaves at most one of the two #defined):
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way (tex2Dlod beats tex2Dbias):
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.  Every other case falls back to the small LUT size.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  No dynamic branches: only enable runtime selection if the user forces it.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  !GEOMETRY_EARLY: no antialiasing border is added
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border (skipped when a tex2Dlod/tex2Dbias strategy is active):
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions (the smallest allowed triad size, i.e. the fewest texels per tile):
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.  Except for the "tile flat twice" case, this is one tile plus a border on each side.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: always one tile plus borders
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;          //  triads rendered across the resized tiles
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;   //  same triad count splatted to x and y, divided per axis
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;     //  pi truncated to 12 decimal places
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; NOTE(review): an earlier note here said this had been raised to 1.0 because the 0.5 default looked unnecessarily dark, but the value actually set is 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (the value used here; raise toward 1.0 if the result looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+//  Alias the average color of whichever aperture grille variant is selected
+//  above: the 14-pixel value if PHOSPHOR_MASK_GRILLE14 is #defined, otherwise
+//  the 15-pixel value.
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+//  The result is max(abs(c), 2^-16), so it is never negative: the sign of c is
+//  discarded, and magnitudes below 2^-16 are raised to 2^-16.
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then, b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    //  Force manual mask resizing and runtime geometry mode selection off,
+    //  whether or not they were requested:
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    //  No compatibility mode: use the static filter setting unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    //  Each #ifdef/#undef pair below removes one runtime option if (and only
+    //  if) it was #defined earlier.  Nothing is #defined in this block.
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Without this, runtime mask mode/type selection can only be enabled by
+    //  DRIVERS_ALLOW_DYNAMIC_BRANCHES further below:
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    //  Requesting the tex2Dlod tiling strategy also requests tex2Dbias.  The
+    //  prioritization code further below keeps at most one of the two.
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    //  Likewise for the resampling strategy:
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+//  No derivatives: the "fix discontinuities" tiling strategy is unavailable.
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+//  No tex2Dlod: drop both tex2Dlod strategies.  ANTIALIAS_DISABLE_ANISOTROPIC
+//  is dropped here too (presumably because it relies on tex2Dlod; confirm
+//  against the antialiasing header).
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+//  No tex2Dbias: drop both tex2Dbias backup strategies.
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+//  After this block, at most one of the four tiling strategies (TEX2DLOD,
+//  TEX2DBIAS, TILE_FLAT_TWICE, FIX_DISCONTINUITIES) is still #defined.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    //  tex2Dlod survived the driver checks: drop every other strategy.
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  tex2Dbias is the best remaining strategy: drop the other two.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        //  It takes precedence over FIX_DISCONTINUITIES when both are set.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+//  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY ends up #defined if either the
+//  tex2Dlod or the tex2Dbias tiling strategy is still #defined at this point.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    //  tex2Dlod resampling is available, so the tex2Dbias backup is unneeded:
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+//  mask_resize_src_lut_size is the large LUT size only when BOTH
+//  DRIVERS_ALLOW_TEX2DLOD and ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD are
+//  #defined (which also #defines PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT).  Every
+//  other combination uses the small LUT size.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+//  Runtime mask mode/type selection is always enabled with dynamic branches.
+//  Without them, it is only enabled if the user explicitly forces it (and the
+//  force macro was not removed earlier in this file).
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+//  The constants below form a chain: each one starts from the previous border
+//  and adds the padding required by one more feature.
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    //  Without GEOMETRY_EARLY, no antialiasing border is reserved:
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+//  (The extra 0.5 is skipped when a tex2Dlod/tex2Dbias tiling strategy is
+//  active.)
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+//  Dividing by the smallest allowed tile size (the minimum triad size times
+//  the triads per tile) gives the largest border measured in tiles.
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+//  Except for the "tile flat twice" special case, both branches compute the
+//  same values: one tile plus a border on each side, with sampling starting
+//  just past the texel border.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+//  Total triads rendered to the resize FBO (tiles times triads per tile):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+//  Dividing by the FBO's viewport-relative scale extrapolates that triad
+//  count to the full viewport, per dimension:
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+//  gamma-management.h's OVERRIDE_DEVICE_GAMMA contract requires crt_gamma,
+//  gba_gamma, and lcd_gamma to be defined before it is included.  gba_gamma is
+//  defined here; NOTE(review): crt_gamma and lcd_gamma are presumably defined
+//  elsewhere in this file -- confirm.
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Build a unit-length (aspect, 1.0) vector.  The aspect ratio is capped
+    //  at geom_max_aspect_ratio first, and normalizing keeps the absolute
+    //  scale from affecting the uv-mapping for curvature:
+    return normalize(
+        float2(min(geom_aspect_ratio, geom_max_aspect_ratio), 1.0));
+}
+
+inline float2 get_geom_overscan_vector()
+{
+    //  Pack the scalar x/y overscan parameters into a single vector:
+    const float2 overscan = float2(geom_overscan_x, geom_overscan_y);
+    return overscan;
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{
+    //  Pack the scalar x/y tilt angle parameters into a single vector:
+    const float2 tilt_angle = float2(geom_tilt_angle_x, geom_tilt_angle_y);
+    return tilt_angle;
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{
+    //  Gather the x convergence offsets of all three channels, in r/g/b order:
+    const float3 offsets_x = float3(
+        convergence_offset_x_r,
+        convergence_offset_x_g,
+        convergence_offset_x_b);
+    return offsets_x;
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{
+    //  Gather the y convergence offsets of all three channels, in r/g/b order:
+    const float3 offsets_y = float3(
+        convergence_offset_y_r,
+        convergence_offset_y_g,
+        convergence_offset_y_b);
+    return offsets_y;
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{
+    //  Pair up the red channel's x and y convergence offsets:
+    const float2 offsets_r =
+        float2(convergence_offset_x_r, convergence_offset_y_r);
+    return offsets_r;
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{
+    //  Pair up the green channel's x and y convergence offsets:
+    const float2 offsets_g =
+        float2(convergence_offset_x_g, convergence_offset_y_g);
+    return offsets_g;
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{
+    //  Pair up the blue channel's x and y convergence offsets:
+    const float2 offsets_b =
+        float2(convergence_offset_x_b, convergence_offset_y_b);
+    return offsets_b;
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    //  The runtime offset is used only when BOTH runtime antialiasing weights
+    //  and runtime subpixel offsets are enabled.  Every other combination
+    //  returns the static offset.
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        return aa_subpixel_r_offset_static;
+    #else
+        #ifndef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            return aa_subpixel_r_offset_static;
+        #else
+            //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #endif
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Return the reciprocal of the average color of the mask selected by
+    //  mask_type (< 0.5: grille, < 1.5: slot, otherwise: shadow):
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    const float slot_or_shadow_gain =
+        mask_type < 1.5 ? slot_gain : shadow_gain;
+    return mask_type < 0.5 ? grille_gain : slot_or_shadow_gain;
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Step 1: Read mask_sample_mode_desired if runtime mask mode selection
+    //  is enabled; otherwise read the static setting.
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        const float requested_mode = mask_sample_mode_desired;
+    #else
+        const float requested_mode = mask_sample_mode_static;
+    #endif
+    //  Step 2: Unless manual mask resizing is compiled in, restrict the
+    //  result to the range [1, 2].
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+#ifndef RUNTIME_GEOMETRY_TILT
+    //  Create a local-to-global rotation matrix for the CRT's coordinate frame
+    //  and its global-to-local inverse.  See the vertex shader for details.
+    //  It's faster to compute these statically if possible.
+    //  Both matrices are built from the sine/cosine of the static tilt angles
+    //  (.x and .y components), and the second is written out as the transpose
+    //  of the first.
+    static const float2 sin_tilt = sin(geom_tilt_angle_static);
+    static const float2 cos_tilt = cos(geom_tilt_angle_static);
+    static const float3x3 geom_local_to_global_static = float3x3(
+        cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x,
+        0.0, cos_tilt.y, -sin_tilt.y,
+        -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x);
+    //  Transpose of geom_local_to_global_static:
+    static const float3x3 geom_global_to_local_static = float3x3(
+        cos_tilt.x, 0.0, -sin_tilt.x,
+        sin_tilt.y*sin_tilt.x, cos_tilt.y, sin_tilt.y*cos_tilt.x,
+        cos_tilt.y*sin_tilt.x, -sin_tilt.y, cos_tilt.y*cos_tilt.x);
+#endif
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(intermediate_output, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants unless OVERRIDE_STANDARD_GAMMA is #defined:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas (ntsc_gamma is the no-simulation default):
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  When true, the encode/decode functions below return alpha = 1.0 instead of
+//  the incoming alpha.  That may help users avoid bugs, if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma accessors: built-in defaults unless the user overrides them.
+#ifndef OVERRIDE_DEVICE_GAMMA
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; }    //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#else
+    //  Overridden: crt_gamma, gba_gamma, and lcd_gamma must be defined globally.
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the three constants read below:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD  //  First SIMULATE_ macro matched below wins
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything: NTSC gamma in and out.
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Per-pass decode/encode settings.  linearize_input and gamma_encode_output
+//  are plain static constants (they aren't derived), which lets the compiler
+//  eliminate the dead branches in the functions that test them.
+#ifdef GAMMA_ENCODE_EVERY_FBO   //  Every pass decodes and encodes manually:
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #endif
+#else   //  Only the first pass decodes and only the last pass encodes:
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma() { return 1.0; }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma() { return 1.0; }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Tell users if bilinear filtering is gamma-correct (no in-shader decoding):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+//  Gamma-encode a color for this pass's output using get_pass_output_gamma().
+//  Returns the color untouched when gamma_encode_output is off.  Alpha passes
+//  through unchanged, or is forced to 1.0 when assume_opaque_alpha is set.
+inline float4 encode_output(const float4 color)
+{
+    //  Nothing to do unless this pass encodes its own output:
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    float out_alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        out_alpha = 1.0;
+    }
+    return float4(encoded_rgb, out_alpha);
+}
+
+//  Decode a sampled color with this pass's input gamma, get_pass_input_gamma().
+//  Returns the color untouched when linearize_input is off.  Alpha passes
+//  through unchanged, or is forced to 1.0 when assume_opaque_alpha is set.
+inline float4 decode_input(const float4 color)
+{
+    //  Nothing to do unless this pass decodes its own input:
+    if(!linearize_input)
+    {
+        return color;
+    }
+    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    float out_alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        out_alpha = 1.0;
+    }
+    return float4(linear_rgb, out_alpha);
+}
+
+//  Decode with a caller-supplied per-channel gamma; linearize_input is ignored.
+//  Alpha passes through unless assume_opaque_alpha forces it to 1.0.
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample the texture, then decode_input() the result.  The float3 overloads only read tex_coords.xy.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  (tex_coords is non-const here, presumably because of the TODO/FIXME note before this section.)
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is passed as textureLod()'s third (LOD) argument, not applied as a texel offset -- confirm intent.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  NOTE(review): same as the float2 overload; texel_off ends up as the LOD -- confirm intent.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: samples tex_coords.xy at a fixed LOD of 0.0, then decode_input(); tex_coords.zw are not read.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): in this overload texel_off is what gets passed as the LOD (tex_coords.w is still unused) -- confirm intent.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
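+//  NOTE: no comment terminator here -- the block comment opened before "tex3D:" stays open, so the code below is disabled until it closes.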
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: samples tex_coords.xy at a fixed LOD of 0.0, then decode_gamma_input() with gamma; tex_coords.zw are not read.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): in this overload texel_off is what gets passed as the LOD (tex_coords.w is still unused) -- confirm intent.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "tex2Dantialias.h"
+
+/////////////////////////  BEGIN TEX2DANTIALIAS  /////////////////////////
+
+#ifndef TEX2DANTIALIAS_H
+#define TEX2DANTIALIAS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides antialiased and subpixel-aware tex2D lookups.
+//  Requires:   All functions share these requirements:
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) pixel_to_tex_uv must be a 2x2 matrix that transforms pixel-
+//                  space offsets to texture uv offsets.  You can get this with:
+//                      const float2 duv_dx = ddx(tex_uv);
+//                      const float2 duv_dy = ddy(tex_uv);
+//                      const float2x2 pixel_to_tex_uv = float2x2(
+//                          duv_dx.x, duv_dy.x,
+//                          duv_dx.y, duv_dy.y);
+//                  This is left to the user in case the current Cg profile
+//                  doesn't support ddx()/ddy().  Ideally, the user could
+//                  calculate a distorted tangent-space mapping analytically.
+//                  If not, a simple flat mapping can be obtained with:
+//                      const float2 xy_to_uv_scale = output_size *
+//                          video_size/texture_size;
+//                      const float2x2 pixel_to_tex_uv = float2x2(
+//                          xy_to_uv_scale.x, 0.0,
+//                          0.0, xy_to_uv_scale.y);
+//  Optional:   To set basic AA settings, #define ANTIALIAS_OVERRIDE_BASICS and:
+//              1.) Set an antialiasing level:
+//                      static const float aa_level = {0 (none),
+//                          1 (sample subpixels), 4, 5, 6, 7, 8, 12, 16, 20, 24}
+//              2.) Set a filter type:
+//                      static const float aa_filter = {
+//                          0 (Box, Separable), 1 (Box, Cylindrical),
+//                          2 (Tent, Separable), 3 (Tent, Cylindrical)
+//                          4 (Gaussian, Separable), 5 (Gaussian, Cylindrical)
+//                          6 (Cubic, Separable), 7 (Cubic, Cylindrical)
+//                          8 (Lanczos Sinc, Separable),
+//                          9 (Lanczos Jinc, Cylindrical)}
+//                  If the input is unknown, a separable box filter is used.
+//                  Note: Lanczos Jinc is terrible for sparse sampling, and
+//                  using aa_axis_importance (see below) defeats the purpose.
+//              3.) Mirror the sample pattern on odd frames?
+//                      static const bool aa_temporal = {true, false}
+//                  This helps rotational invariance but can look "fluttery."
+//              The user may #define ANTIALIAS_OVERRIDE_PARAMETERS to override
+//              (all of) the following default parameters with static or uniform
+//              constants (or an accessor function for subpixel offsets):
+//              1.) Cubic parameters:
+//                      static const float aa_cubic_c = 0.5;
+//                  See http://www.imagemagick.org/Usage/filter/#mitchell
+//              2.) Gaussian parameters:
+//                      static const float aa_gauss_sigma =
+//                          0.5/aa_pixel_diameter;
+//              3.) Set subpixel offsets.  This requires an accessor function
+//                  for compatibility with scalar runtime shader params.  Return
+//                  a float2 pixel offset in [-0.5, 0.5] for the red subpixel:
+//                      float2 get_aa_subpixel_r_offset()
+//              The user may also #define ANTIALIAS_OVERRIDE_STATIC_CONSTANTS to
+//              override (all of) the following default static values.  However,
+//              the file's structure requires them to be declared static const:
+//              1.) static const float aa_lanczos_lobes = 3.0;
+//              2.) static const float aa_gauss_support = 1.0/aa_pixel_diameter;
+//                  Note the default tent/Gaussian support radii may appear
+//                  arbitrary, but extensive testing found them nearly optimal
+//                  for tough cases like strong distortion at low AA levels.
+//                  (The Gaussian default is only best for practical gauss_sigma
+//                  values; much larger gauss_sigmas ironically prefer slightly
+//                  smaller support given sparse sampling, and vice versa.)
+//              3.) static const float aa_tent_support = 1.0 / aa_pixel_diameter;
+//              4.) static const float2 aa_xy_axis_importance:
+//                  The sparse N-queens sampling grid interacts poorly with
+//                  negative-lobed 2D filters.  However, if aliasing is much
+//                  stronger in one direction (e.g. horizontally with a phosphor
+//                  mask), it can be useful to downplay sample offsets along the
+//                  other axis.  The support radius in each direction scales with
+//                  aa_xy_axis_importance down to a minimum of 0.5 (box support),
+//                  after which point only the offsets used for calculating
+//                  weights continue to scale downward.  This works as follows:
+//                  If aa_xy_axis_importance = float2(1.0, 1.0/support_radius),
+//                  the vertical support radius will drop to 1.0, and we'll just
+//                  filter vertical offsets with the first filter lobe, while
+//                  horizontal offsets go through the full multi-lobe filter.
+//                  If aa_xy_axis_importance = float2(1.0, 0.0), the vertical
+//                  support radius will drop to box support, and the vertical
+//                  offsets will be ignored entirely (essentially giving us a
+//                  box filter vertically).  The former is potentially smoother
+//                  (but less predictable) and the default behavior of Lanczos
+//                  jinc, whereas the latter is sharper and the default behavior
+//                  of cubics and Lanczos sinc.
+//              5.) static const float aa_pixel_diameter: You can expand the
+//                  pixel diameter to e.g. sqrt(2.0), which may be a better
+//                  support range for cylindrical filters (they don't
+//                  currently discard out-of-circle samples though).
+//              Finally, there are two miscellaneous options:
+//              1.) If you want to antialias a manually tiled texture, you can
+//                  #define ANTIALIAS_DISABLE_ANISOTROPIC to use tex2Dlod() to
+//                  fix incompatibilities with anisotropic filtering.  This is
+//                  slower, and the Cg profile must support tex2Dlod().
+//              2.) If aa_cubic_c is a runtime uniform, you can #define
+//                  RUNTIME_ANTIALIAS_WEIGHTS to evaluate cubic weights once per
+//                  fragment instead of at the usage site (which is used by
+//                  default, because it enables static evaluation).
+//  Description:
+//  Each antialiased lookup follows these steps:
+//  1.) Define a sample pattern of pixel offsets in the range of [-0.5, 0.5]
+//      pixels, spanning the diameter of a rectangular box filter.
+//  2.) Scale these offsets by the support diameter of the user's chosen filter.
+//  3.) Using these pixel offsets from the pixel center, compute the offsets to
+//      predefined subpixel locations.
+//  4.) Compute filter weights based on subpixel offsets.
+//  Much of that can often be done at compile-time.  At runtime:
+//  1.) Project pixel-space offsets into uv-space with a matrix multiplication
+//      to get the uv offsets for each sample.  Rectangular pixels have a
+//      diameter of 1.0.  Circular pixels are not currently supported, but they
+//      might be better with a diameter of sqrt(2.0) to ensure there are no gaps
+//      between them.
+//  2.) Load, weight, and sum samples.
+//  We use a sparse bilinear sampling grid, so there are two major implications:
+//  1.) We can directly project the pixel-space support box into uv-space even
+//      if we're upsizing.  This wouldn't be the case for nearest neighbor,
+//      where we'd have to expand the uv-space diameter to at least the support
+//      size to ensure sufficient filter support.  In our case, this allows us
+//      to treat upsizing the same as downsizing and use static weighting. :)
+//  2.) For decent results, negative-lobed filters must be computed based on
+//      separable weights, not radial distances, because the sparse sampling
+//      makes no guarantees about radial distributions.  Even then, it's much
+//      better to set aa_xy_axis_importance to e.g. float2(1.0, 0.0) to use e.g.
+//      Lanczos2 horizontally and a box filter vertically.  This is mainly due
+//      to the sparse N-queens sampling and a statistically enormous positive or
+//      negative covariance between horizontal and vertical weights.
+//
+//  Design Decision Comments:
+//  "aa_temporal" mirrors the sample pattern on odd frames along the axis that
+//  keeps subpixel weights constant.  This helps with rotational invariance, but
+//  it can cause distracting fluctuations, and horizontal and vertical edges
+//  will look the same.  Using a different pattern on a shifted grid would
+//  exploit temporal AA better, but it would require a dynamic branch or a lot
+//  of conditional moves, so it's prohibitively slow for the minor benefit.
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#ifndef ANTIALIAS_OVERRIDE_BASICS
+    //  The following settings must be static constants:
+    static const float aa_level = 12.0;     //  Presumably the sample count (tex2Daa*x variant); selection code not shown here
+    static const float aa_filter = 0.0;     //  0/1 box, 2/3 tent, 4/5 Gaussian, 6/7 cubic (even: separable, odd: radial), 8 Lanczos sinc, 9 Lanczos jinc
+    static const bool aa_temporal = false;  //  true: get_frame_sign() mirrors the sample pattern
+#endif
+
+#ifndef ANTIALIAS_OVERRIDE_STATIC_CONSTANTS
+    //  Users may override these parameters, but the file structure requires
+    //  them to be static constants; see the descriptions above.
+    static const float aa_pixel_diameter = 1.0;     //  Scales pixel_to_tex_uv inside the tex2Daa*x lookups
+    static const float aa_lanczos_lobes = 3.0;      //  Lanczos support radius and window divisor
+    static const float aa_gauss_support = 1.0 / aa_pixel_diameter;  //  Gaussian support radius
+    static const float aa_tent_support = 1.0 / aa_pixel_diameter;   //  Tent support radius
+    
+    //  If we're using a negative-lobed filter, default to using it horizontally
+    //  only, and use only the first lobe vertically or a box filter, over a
+    //  correspondingly smaller range.  This compensates for the sparse sampling
+    //  grid's typically large positive/negative x/y covariance.
+    static const float2 aa_xy_axis_importance =
+        aa_filter < 5.5 ? float2(1.0) :         //  Box, tent, Gaussian
+        aa_filter < 8.5 ? float2(1.0, 0.0) :    //  Cubic and Lanczos sinc
+        aa_filter < 9.5 ? float2(1.0, 1.0/aa_lanczos_lobes) :   //  Lanczos jinc
+        float2(1.0);                            //  Default to box
+#endif
+
+#ifndef ANTIALIAS_OVERRIDE_PARAMETERS
+    //  Users may override these values with their own uniform or static consts.
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c = 0.5;
+    static const float aa_gauss_sigma = 0.5 / aa_pixel_diameter;
+    //  Users may override the subpixel offset accessor function with their own.
+    //  A function is used for compatibility with scalar runtime shader params.
+    inline float2 get_aa_subpixel_r_offset()
+    {
+        return float2(0.0, 0.0);    //  Default: no R subpixel offset (B is evaluated with the opposite shift)
+    }
+#endif
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+static const float aa_box_support = 0.5;    //  Box filter support radius; also the minimum per-axis support radius
+static const float aa_cubic_support = 2.0;  //  Cubic filter support radius (eval_cubic_filter is 0.0 beyond it)
+
+
+////////////////////////////  GLOBAL NON-CONSTANTS  ////////////////////////////
+
+//  Cubic coefficients set by assign_aa_cubic_constants(); we'll want to define these only once per fragment at most.
+#ifdef RUNTIME_ANTIALIAS_WEIGHTS
+    float aa_cubic_b;
+    float cubic_branch1_x3_coeff;
+    float cubic_branch1_x2_coeff;
+    float cubic_branch1_x0_coeff;
+    float cubic_branch2_x3_coeff;
+    float cubic_branch2_x2_coeff;
+    float cubic_branch2_x1_coeff;
+    float cubic_branch2_x0_coeff;
+#endif
+
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+void assign_aa_cubic_constants()
+{
+    //  Compute cubic coefficients on demand at runtime, and save them to the
+    //  file-level globals.  The B parameter is computed from C, because "Keys
+    //  cubics" with B = 1 - 2C are considered the highest quality.  No-op otherwise.
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        if(aa_filter > 5.5 && aa_filter < 7.5)  //  Cubic filters only (aa_filter 6 or 7)
+        {
+            aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
+            cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
+            cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
+            cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b;
+            cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c;
+            cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
+            cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
+            cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
+        }
+    #endif
+}
+
+inline float4 get_subpixel_support_diam_and_final_axis_importance()    //  Returns float4(support diameter xy, final axis importance xy)
+{
+    //  Statically select the base support radius:
+    static const float base_support_radius =
+        aa_filter < 1.5 ? aa_box_support :
+        aa_filter < 3.5 ? aa_tent_support :
+        aa_filter < 5.5 ? aa_gauss_support :
+        aa_filter < 7.5 ? aa_cubic_support :
+        aa_filter < 9.5 ? aa_lanczos_lobes :
+        aa_box_support; //  Default to box
+    //  Expand the support by |R subpixel offset| (R/B weights are evaluated at offsets shifted by it).
+    const float2 subpixel_support_radius_raw =
+        float2(base_support_radius) + abs(get_aa_subpixel_r_offset());
+    if(aa_filter < 1.5)
+    {
+        //  Ignore aa_xy_axis_importance for box filtering.
+        const float2 subpixel_support_diam =
+            2.0 * subpixel_support_radius_raw;
+        const float2 final_axis_importance = float2(1.0);
+        return float4(subpixel_support_diam, final_axis_importance);
+    }
+    else
+    {
+        //  Scale the support window by aa_xy_axis_importance, but don't narrow
+        //  it further than box support.  This allows decent vertical AA without
+        //  messing up horizontal weights or using something silly like Lanczos4
+        //  horizontally with a huge vertical average over an 8-pixel radius.
+        const float2 subpixel_support_radius = max(float2(aa_box_support, aa_box_support),
+            subpixel_support_radius_raw * aa_xy_axis_importance);
+        //  Adjust aa_xy_axis_importance to compensate for what's already done:
+        const float2 final_axis_importance = aa_xy_axis_importance *
+            subpixel_support_radius_raw/subpixel_support_radius;
+        const float2 subpixel_support_diam = 2.0 * subpixel_support_radius;
+        return float4(subpixel_support_diam, final_axis_importance);
+    }
+}
+
+
+///////////////////////////  FILTER WEIGHT FUNCTIONS  //////////////////////////
+
+inline float eval_box_filter(const float dist)    //  1.0 inside the box support, 0.0 outside
+{
+    return (abs(dist) <= aa_box_support) ? 1.0 : 0.0;
+}
+
+inline float eval_separable_box_filter(const float2 offset)    //  1.0 only when both axes lie inside the box support
+{
+    return ((abs(offset.x) <= aa_box_support) && (abs(offset.y) <= aa_box_support)) ? 1.0 : 0.0;
+}
+
+inline float eval_tent_filter(const float dist)    //  Linear falloff: 1.0 at dist == 0.0, 0.0 at |dist| >= aa_tent_support
+{
+    //  Use |dist|: the separable tent passes signed per-axis offsets, and the filter must be even.
+    return clamp((aa_tent_support - abs(dist))/aa_tent_support, 0.0, 1.0);
+}
+
+inline float eval_gaussian_filter(const float dist)    //  Unnormalized Gaussian with sigma = aa_gauss_sigma; 1.0 at dist == 0.0
+{
+    return exp(-(dist*dist) / (2.0*aa_gauss_sigma*aa_gauss_sigma));
+}
+
+inline float eval_cubic_filter(const float dist)    //  Piecewise cubic in |dist|; 0.0 for |dist| >= 2.0
+{
+    //  Compute coefficients like assign_aa_cubic_constants(), but statically.
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        //  When runtime weights are used, these values are instead written to
+        //  file-level globals at the beginning of each tex2Daa* call.
+        const float aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
+        const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
+        const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
+        const float cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b;
+        const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c;
+        const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
+        const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
+        const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
+    #endif
+    const float abs_dist = abs(dist);
+    //  Compute the cubic based on the Horner's method formula in:
+    //  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf
+    return (abs_dist < 1.0 ?
+        (cubic_branch1_x3_coeff*abs_dist +
+            cubic_branch1_x2_coeff)*abs_dist*abs_dist +
+            cubic_branch1_x0_coeff :
+        abs_dist < 2.0 ?
+            ((cubic_branch2_x3_coeff*abs_dist +
+                cubic_branch2_x2_coeff)*abs_dist +
+                cubic_branch2_x1_coeff)*abs_dist + cubic_branch2_x0_coeff :
+            0.0)/6.0;
+}
+
+inline float eval_separable_cubic_filter(const float2 offset)
+{
+    //  Product of two scalar evaluations; faster than a dedicated float2 cubic.
+    const float weight_x = eval_cubic_filter(offset.x);
+    return weight_x * eval_cubic_filter(offset.y);
+}
+
+inline float2 eval_sinc_filter(const float2 offset)    //  Per-axis sin(pi*x)/(pi*x); offset components must be nonzero
+{
+    //  It's faster to let the caller handle the zero case, or at least it
+    //  was when I used macros and the shader preset took a full minute to load.
+    const float2 pi_offset = pi * offset;
+    return sin(pi_offset)/pi_offset;
+}
+
+inline float eval_separable_lanczos_sinc_filter(const float2 offset_unsafe)    //  Product of per-axis sinc * windowed sinc weights
+{
+    //  Note: For sparse sampling, you really need to pick an axis to use
+    //  Lanczos along (e.g. set aa_xy_axis_importance = float2(1.0, 0.0)).
+    const float2 offset = FIX_ZERO(offset_unsafe);  //  Keeps eval_sinc_filter away from 0/0
+    const float2 xy_weights = eval_sinc_filter(offset) *
+        eval_sinc_filter(offset/aa_lanczos_lobes);
+    return xy_weights.x * xy_weights.y;
+}
+
+inline float eval_jinc_filter_unorm(const float x)    //  x is pi * distance (see eval_jinc_filter); the result is about 0.5 at x == 0.0
+{
+    //  This is a Jinc approximation for x in [0, 45).  We'll use x in range
+    //  [0, 4*pi) or so.  There are faster/closer approximations based on
+    //  piecewise cubics from [0, 45) and asymptotic approximations beyond that,
+    //  but this has a maximum absolute error < 1/512, and it's simpler/faster
+    //  for shaders...not that it's all that useful for sparse sampling anyway.
+    const float point3845_x = 0.38448566093564*x;
+    const float exp_term = exp(-(point3845_x*point3845_x));
+    const float point8154_plus_x = 0.815362332840791 + x;
+    const float cos_term = cos(point8154_plus_x);
+    return (
+        0.0264727330997042*min(x, 6.83134964622778) +
+        0.680823557250528*exp_term +
+        -0.0597255978950933*min(7.41043194481873, x)*cos_term /
+            (point8154_plus_x + 0.0646074538634482*(x*x) +
+            cos(x)*max(exp_term, cos(x) + cos_term)) -
+        0.180837503591406);
+}
+
+inline float eval_jinc_filter(const float dist)    //  Jinc approximation at a pixel distance; callers pass dist >= 0.0
+{
+    return eval_jinc_filter_unorm(pi * dist);
+}
+
+inline float eval_lanczos_jinc_filter(const float dist)    //  Jinc windowed by a jinc stretched over aa_lanczos_lobes
+{
+    return eval_jinc_filter(dist) * eval_jinc_filter(dist/aa_lanczos_lobes);
+}
+
+
+inline float3 eval_unorm_rgb_weights(const float2 offset,
+    const float2 final_axis_importance)
+{
+    //  Requires:   1.) final_axis_importance must be computed according to
+    //                  get_subpixel_support_diam_and_final_axis_importance().
+    //              2.) aa_filter must be a global constant.
+    //              3.) offset must be an xy pixel offset in the range:
+    //                      ([-subpixel_support_diameter.x/2,
+    //                      subpixel_support_diameter.x/2],
+    //                      [-subpixel_support_diameter.y/2,
+    //                      subpixel_support_diameter.y/2])
+    //  Returns:    Unnormalized sample weights at R/G/B destination subpixels
+    //              for the given xy pixel offset (callers divide by the sum).
+    const float2 offset_g = offset * final_axis_importance;
+    const float2 aa_r_offset = get_aa_subpixel_r_offset();
+    const float2 offset_r = offset_g - aa_r_offset * final_axis_importance;
+    const float2 offset_b = offset_g + aa_r_offset * final_axis_importance;
+    //  Statically select a filter:
+    if(aa_filter < 0.5)         //  0: separable box
+    {
+        return float3(eval_separable_box_filter(offset_r),
+            eval_separable_box_filter(offset_g),
+            eval_separable_box_filter(offset_b));
+    }
+    else if(aa_filter < 1.5)    //  1: radial box
+    {
+        return float3(eval_box_filter(length(offset_r)),
+            eval_box_filter(length(offset_g)),
+            eval_box_filter(length(offset_b)));
+    }
+    else if(aa_filter < 2.5)    //  2: separable tent
+    {
+        return float3(
+            eval_tent_filter(offset_r.x) * eval_tent_filter(offset_r.y),
+            eval_tent_filter(offset_g.x) * eval_tent_filter(offset_g.y),
+            eval_tent_filter(offset_b.x) * eval_tent_filter(offset_b.y));
+    }
+    else if(aa_filter < 3.5)    //  3: radial tent
+    {
+        return float3(eval_tent_filter(length(offset_r)),
+            eval_tent_filter(length(offset_g)),
+            eval_tent_filter(length(offset_b)));
+    }
+    else if(aa_filter < 4.5)    //  4: separable Gaussian
+    {
+        return float3(
+            eval_gaussian_filter(offset_r.x) * eval_gaussian_filter(offset_r.y),
+            eval_gaussian_filter(offset_g.x) * eval_gaussian_filter(offset_g.y),
+            eval_gaussian_filter(offset_b.x) * eval_gaussian_filter(offset_b.y));
+    }
+    else if(aa_filter < 5.5)    //  5: radial Gaussian
+    {
+        return float3(eval_gaussian_filter(length(offset_r)),
+            eval_gaussian_filter(length(offset_g)),
+            eval_gaussian_filter(length(offset_b)));
+    }
+    else if(aa_filter < 6.5)    //  6: separable cubic
+    {
+        return float3(
+            eval_cubic_filter(offset_r.x) * eval_cubic_filter(offset_r.y),
+            eval_cubic_filter(offset_g.x) * eval_cubic_filter(offset_g.y),
+            eval_cubic_filter(offset_b.x) * eval_cubic_filter(offset_b.y));
+    }
+    else if(aa_filter < 7.5)    //  7: radial cubic
+    {
+        return float3(eval_cubic_filter(length(offset_r)),
+            eval_cubic_filter(length(offset_g)),
+            eval_cubic_filter(length(offset_b)));
+    }
+    else if(aa_filter < 8.5)    //  8: separable Lanczos sinc
+    {
+        return float3(eval_separable_lanczos_sinc_filter(offset_r),
+            eval_separable_lanczos_sinc_filter(offset_g),
+            eval_separable_lanczos_sinc_filter(offset_b));
+    }
+    else if(aa_filter < 9.5)    //  9: radial Lanczos jinc
+    {
+        return float3(eval_lanczos_jinc_filter(length(offset_r)),
+            eval_lanczos_jinc_filter(length(offset_g)),
+            eval_lanczos_jinc_filter(length(offset_b)));
+    }
+    else
+    {
+        //  Default to a box, because Lanczos Jinc is so bad. ;)
+        return float3(eval_separable_box_filter(offset_r),
+            eval_separable_box_filter(offset_g),
+            eval_separable_box_filter(offset_b));
+    }
+}
+
+
+//////////////////////////////  HELPER FUNCTIONS  //////////////////////////////
+
+inline float4 tex2Daa_tiled_linearize(const sampler2D samp, const float2 s)    //  Linearized texture fetch at uv coordinate s
+{
+    //  If we're manually tiling a texture, anisotropic filtering can get
+    //  confused.  This is one workaround (an explicit-LOD lookup at mip 0):
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        //  TODO: Use tex2Dlod_linearize with a calculated mip level.
+        return tex2Dlod_linearize(samp, float4(s, 0.0, 0.0));
+    #else
+        return tex2D_linearize(samp, s);
+    #endif
+}
+
+inline float2 get_frame_sign(const float frame)    //  Per-axis factor the tex2Daa*x lookups apply to their xy sample offsets
+{
+    if(aa_temporal)
+    {
+        //  Mirror the sampling pattern for odd frames only, in a direction that
+        //  lets us keep the same subpixel sample weights; even frames get (1.0, 1.0):
+        const float frame_odd = float(fmod(frame, 2.0) > 0.5);
+        const float2 aa_r_offset = get_aa_subpixel_r_offset();
+        const float2 mirror = -float2(abs(aa_r_offset.x) < (FIX_ZERO(0.0)), abs(aa_r_offset.y) < (FIX_ZERO(0.0)));
+        return float2(1.0, 1.0) + frame_odd * (mirror - float2(1.0, 1.0));  //  Linear blend: (1.0, 1.0) when even, mirror when odd
+    }
+    else
+    {
+        return float2(1.0, 1.0);    //  Temporal AA disabled: never mirror
+    }
+}
+
+
+/////////////////////////  ANTIALIASED TEXTURE LOOKUPS  ////////////////////////
+
+float3 tex2Daa_subpixel_weights_only(const sampler2D tex,
+    const float2 tex_uv, const float2x2 pixel_to_tex_uv)    //  pixel_to_tex_uv maps pixel-space offsets to uv offsets
+{
+    //  This function is unlike the others: Just perform a single independent
+    //  lookup for each subpixel.  It may be very aliased.
+    const float2 aa_r_offset = get_aa_subpixel_r_offset();
+    const float2 aa_r_offset_uv_offset = mul(pixel_to_tex_uv, aa_r_offset);
+    const float color_g = tex2D_linearize(tex, tex_uv).g;                           //  G at the pixel center
+    const float color_r = tex2D_linearize(tex, tex_uv + aa_r_offset_uv_offset).r;   //  R shifted by the subpixel offset
+    const float color_b = tex2D_linearize(tex, tex_uv - aa_r_offset_uv_offset).b;   //  B shifted the opposite way
+    return float3(color_r, color_g, color_b);
+}
+
+//  The tex2Daa* functions compile very slowly due to all the macros and
+//  compile-time math, so only include the ones we'll actually use!
+float3 tex2Daa4x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)    //  frame is only passed to get_frame_sign() (temporal mirroring)
+{
+    //  Use an RGMS4 pattern (4-queens):
+    //  . . Q .  : off =(-1.5, -1.5)/4 + (2.0, 0.0)/4
+    //  Q . . .  : off =(-1.5, -1.5)/4 + (0.0, 1.0)/4
+    //  . . . Q  : off =(-1.5, -1.5)/4 + (3.0, 2.0)/4
+    //  . Q . .  : off =(-1.5, -1.5)/4 + (1.0, 3.0)/4
+    //  Static screenspace sample offsets (compute some implicitly):
+    static const float grid_size = 4.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0,1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5,0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
+    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(0.0, 1.0) * xy_step;
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = w1.bgr;   //  Sample at -xy_offset1: R and B weights swap
+    const float3 w3 = w0.bgr;   //  Sample at -xy_offset0: R and B weights swap
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const float3 half_sum = w0 + w1;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0,1.0,1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
+    //  diagonal symmetry:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    //  Load samples, linearizing if necessary, etc.:
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (w0 * sample0 + w1 * sample1 +
+        w2 * sample2 + w3 * sample3);
+}
+
+float3 tex2Daa5x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)    //  frame is only passed to get_frame_sign() (temporal mirroring)
+{
+    //  Use a diagonally symmetric 5-queens pattern:
+    //  . Q . . .  : off =(-2.0, -2.0)/5 + (1.0, 0.0)/5
+    //  . . . . Q  : off =(-2.0, -2.0)/5 + (4.0, 1.0)/5
+    //  . . Q . .  : off =(-2.0, -2.0)/5 + (2.0, 2.0)/5
+    //  Q . . . .  : off =(-2.0, -2.0)/5 + (0.0, 3.0)/5
+    //  . . . Q .  : off =(-2.0, -2.0)/5 + (3.0, 4.0)/5
+    //  Static screenspace sample offsets (compute some implicitly):
+    static const float grid_size = 5.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
+    const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(2.0, 2.0) * xy_step;    //  Center sample (zero offset)
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = w1.bgr;   //  Sample at -xy_offset1: R and B weights swap
+    const float3 w4 = w0.bgr;   //  Sample at -xy_offset0: R and B weights swap
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const float3 w_sum_inv = float3(1.0)/(w0 + w1 + w2 + w3 + w4);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
+    //  diagonal symmetry:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    //  Load samples, linearizing if necessary, etc.:
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (w0 * sample0 + w1 * sample1 +
+        w2 * sample2 + w3 * sample3 + w4 * sample4);
+}
+
+float3 tex2Daa6x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)    //  frame is only passed to get_frame_sign() (temporal mirroring)
+{
+    //  Use a diagonally symmetric 6-queens pattern with a stronger horizontal
+    //  than vertical slant:
+    //  . . . . Q .  : off =(-2.5, -2.5)/6 + (4.0, 0.0)/6
+    //  . . Q . . .  : off =(-2.5, -2.5)/6 + (2.0, 1.0)/6
+    //  Q . . . . .  : off =(-2.5, -2.5)/6 + (0.0, 2.0)/6
+    //  . . . . . Q  : off =(-2.5, -2.5)/6 + (5.0, 3.0)/6
+    //  . . . Q . .  : off =(-2.5, -2.5)/6 + (3.0, 4.0)/6
+    //  . Q . . . .  : off =(-2.5, -2.5)/6 + (1.0, 5.0)/6
+    //  Static screenspace sample offsets (compute some implicitly):
+    static const float grid_size = 6.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
+    const float2 xy_offset0 = xy_start_offset + float2(4.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(2.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step;
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = w2.bgr;   //  Sample at -xy_offset2: R and B weights swap
+    const float3 w4 = w1.bgr;   //  Sample at -xy_offset1: R and B weights swap
+    const float3 w5 = w0.bgr;   //  Sample at -xy_offset0: R and B weights swap
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const float3 half_sum = w0 + w1 + w2;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
+    //  diagonal symmetry:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    //  Load samples, linearizing if necessary, etc.:
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 +
+        w3 * sample3 + w4 * sample4 + w5 * sample5);
+}
+
+float3 tex2Daa7x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)    //  frame is only passed to get_frame_sign() (temporal mirroring)
+{
+    //  Use a diagonally symmetric 7-queens pattern with a queen in the center:
+    //  . Q . . . . .  : off =(-3.0, -3.0)/7 + (1.0, 0.0)/7
+    //  . . . . Q . .  : off =(-3.0, -3.0)/7 + (4.0, 1.0)/7
+    //  Q . . . . . .  : off =(-3.0, -3.0)/7 + (0.0, 2.0)/7
+    //  . . . Q . . .  : off =(-3.0, -3.0)/7 + (3.0, 3.0)/7
+    //  . . . . . . Q  : off =(-3.0, -3.0)/7 + (6.0, 4.0)/7
+    //  . . Q . . . .  : off =(-3.0, -3.0)/7 + (2.0, 5.0)/7
+    //  . . . . . Q .  : off =(-3.0, -3.0)/7 + (5.0, 6.0)/7
+    static const float grid_size = 7.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
+    const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(3.0, 3.0) * xy_step;    //  Center sample (zero offset)
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = w2.bgr;   //  Sample at -xy_offset2: R and B weights swap
+    const float3 w5 = w1.bgr;   //  Sample at -xy_offset1: R and B weights swap
+    const float3 w6 = w0.bgr;   //  Sample at -xy_offset0: R and B weights swap
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const float3 half_sum = w0 + w1 + w2;
+    const float3 w_sum = half_sum + half_sum.bgr + w3;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
+    //  diagonal symmetry:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    //  Load samples, linearizing if necessary, etc.:
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6);
+}
+
+float3 tex2Daa8x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)    //  frame is only passed to get_frame_sign() (temporal mirroring)
+{
+    //  Use a diagonally symmetric 8-queens pattern.
+    //  . . Q . . . . .  : off =(-3.5, -3.5)/8 + (2.0, 0.0)/8
+    //  . . . . Q . . .  : off =(-3.5, -3.5)/8 + (4.0, 1.0)/8
+    //  . Q . . . . . .  : off =(-3.5, -3.5)/8 + (1.0, 2.0)/8
+    //  . . . . . . . Q  : off =(-3.5, -3.5)/8 + (7.0, 3.0)/8
+    //  Q . . . . . . .  : off =(-3.5, -3.5)/8 + (0.0, 4.0)/8
+    //  . . . . . . Q .  : off =(-3.5, -3.5)/8 + (6.0, 5.0)/8
+    //  . . . Q . . . .  : off =(-3.5, -3.5)/8 + (3.0, 6.0)/8
+    //  . . . . . Q . .  : off =(-3.5, -3.5)/8 + (5.0, 7.0)/8
+    static const float grid_size = 8.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
+    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(1.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(7.0, 3.0) * xy_step;
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = w3.bgr;   //  Sample at -xy_offset3: R and B weights swap
+    const float3 w5 = w2.bgr;   //  Sample at -xy_offset2: R and B weights swap
+    const float3 w6 = w1.bgr;   //  Sample at -xy_offset1: R and B weights swap
+    const float3 w7 = w0.bgr;   //  Sample at -xy_offset0: R and B weights swap
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const float3 half_sum = w0 + w1 + w2 + w3;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets, and mirror on odd frames if directed:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    //  Load samples, linearizing if necessary, etc.:
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7);
+}
+
+float3 tex2Daa12x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  12-tap antialiased lookup.  Taps sit on the 12-superqueens layout drawn
+    //  below (no 3 collinear points); taps 6-11 mirror taps 5-0 through center.
+    //  . . . Q . . . . . . . .  : off =(-5.5, -5.5)/12 + (3.0, 0.0)/12
+    //  . . . . . . . . . Q . .  : off =(-5.5, -5.5)/12 + (9.0, 1.0)/12
+    //  . . . . . . Q . . . . .  : off =(-5.5, -5.5)/12 + (6.0, 2.0)/12
+    //  . Q . . . . . . . . . .  : off =(-5.5, -5.5)/12 + (1.0, 3.0)/12
+    //  . . . . . . . . . . . Q  : off =(-5.5, -5.5)/12 + (11.0, 4.0)/12
+    //  . . . . Q . . . . . . .  : off =(-5.5, -5.5)/12 + (4.0, 5.0)/12
+    //  . . . . . . . Q . . . .  : off =(-5.5, -5.5)/12 + (7.0, 6.0)/12
+    //  Q . . . . . . . . . . .  : off =(-5.5, -5.5)/12 + (0.0, 7.0)/12
+    //  . . . . . . . . . . Q .  : off =(-5.5, -5.5)/12 + (10.0, 8.0)/12
+    //  . . . . . Q . . . . . .  : off =(-5.5, -5.5)/12 + (5.0, 9.0)/12
+    //  . . Q . . . . . . . . .  : off =(-5.5, -5.5)/12 + (2.0, 10.0)/12
+    //  . . . . . . . . Q . . .  : off =(-5.5, -5.5)/12 + (8.0, 11.0)/12
+    static const float grid_size = 12.0;
+    assign_aa_cubic_constants();
+    const float4 support_and_importance = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 support_diam = support_and_importance.xy;
+    const float2 axis_importance = support_and_importance.zw;
+    const float2 grid_step = float2(1.0)/grid_size * support_diam;
+    const float2 grid_origin = float2(0.5 - grid_size*0.5) * grid_step;
+    //  Pixel-space positions of the first half of the taps (rows 0-5 above):
+    const float2 pt0 = grid_origin + float2(3.0, 0.0) * grid_step;
+    const float2 pt1 = grid_origin + float2(9.0, 1.0) * grid_step;
+    const float2 pt2 = grid_origin + float2(6.0, 2.0) * grid_step;
+    const float2 pt3 = grid_origin + float2(1.0, 3.0) * grid_step;
+    const float2 pt4 = grid_origin + float2(11.0, 4.0) * grid_step;
+    const float2 pt5 = grid_origin + float2(4.0, 5.0) * grid_step;
+    //  Per-channel weights; each mirrored tap reuses its partner's weight as .bgr.
+    const float3 wt0 = eval_unorm_rgb_weights(pt0, axis_importance);
+    const float3 wt1 = eval_unorm_rgb_weights(pt1, axis_importance);
+    const float3 wt2 = eval_unorm_rgb_weights(pt2, axis_importance);
+    const float3 wt3 = eval_unorm_rgb_weights(pt3, axis_importance);
+    const float3 wt4 = eval_unorm_rgb_weights(pt4, axis_importance);
+    const float3 wt5 = eval_unorm_rgb_weights(pt5, axis_importance);
+    const float3 wt6 = wt5.bgr;
+    const float3 wt7 = wt4.bgr;
+    const float3 wt8 = wt3.bgr;
+    const float3 wt9 = wt2.bgr;
+    const float3 wt10 = wt1.bgr;
+    const float3 wt11 = wt0.bgr;
+    //  Total weight per channel, inverted so the result can be normalized:
+    const float3 first_half_total = wt0 + wt1 + wt2 + wt3 + wt4 + wt5;
+    const float3 weight_total = first_half_total + first_half_total.bgr;
+    const float3 weight_norm = float3(1.0)/weight_total;
+    //  Fold the antialiasing pixel diameter into the pixel->uv matrix.
+    const float2x2 scaled_pixel_to_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Convert tap positions to uv offsets, applying the sign pair returned by
+    //  get_frame_sign(frame) to each position first:
+    const float2 tap_sign = get_frame_sign(frame);
+    const float2 duv0 = mul(scaled_pixel_to_uv, pt0 * tap_sign);
+    const float2 duv1 = mul(scaled_pixel_to_uv, pt1 * tap_sign);
+    const float2 duv2 = mul(scaled_pixel_to_uv, pt2 * tap_sign);
+    const float2 duv3 = mul(scaled_pixel_to_uv, pt3 * tap_sign);
+    const float2 duv4 = mul(scaled_pixel_to_uv, pt4 * tap_sign);
+    const float2 duv5 = mul(scaled_pixel_to_uv, pt5 * tap_sign);
+    //  Fetch every tap; the second half uses the negated offsets in reverse:
+    const float3 tap0 = tex2Daa_tiled_linearize(tex, tex_uv + duv0).rgb;
+    const float3 tap1 = tex2Daa_tiled_linearize(tex, tex_uv + duv1).rgb;
+    const float3 tap2 = tex2Daa_tiled_linearize(tex, tex_uv + duv2).rgb;
+    const float3 tap3 = tex2Daa_tiled_linearize(tex, tex_uv + duv3).rgb;
+    const float3 tap4 = tex2Daa_tiled_linearize(tex, tex_uv + duv4).rgb;
+    const float3 tap5 = tex2Daa_tiled_linearize(tex, tex_uv + duv5).rgb;
+    const float3 tap6 = tex2Daa_tiled_linearize(tex, tex_uv - duv5).rgb;
+    const float3 tap7 = tex2Daa_tiled_linearize(tex, tex_uv - duv4).rgb;
+    const float3 tap8 = tex2Daa_tiled_linearize(tex, tex_uv - duv3).rgb;
+    const float3 tap9 = tex2Daa_tiled_linearize(tex, tex_uv - duv2).rgb;
+    const float3 tap10 = tex2Daa_tiled_linearize(tex, tex_uv - duv1).rgb;
+    const float3 tap11 = tex2Daa_tiled_linearize(tex, tex_uv - duv0).rgb;
+    //  Weighted sum of all taps, scaled so each channel's weights total 1.0:
+    return weight_norm * (
+        wt0 * tap0 + wt1 * tap1 + wt2 * tap2 + wt3 * tap3 +
+        wt4 * tap4 + wt5 * tap5 + wt6 * tap6 + wt7 * tap7 +
+        wt8 * tap8 + wt9 * tap9 + wt10 * tap10 + wt11 * tap11);
+}
+
+float3 tex2Daa16x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  16-tap antialiased lookup.  Taps sit on the 16-superqueens layout drawn
+    //  below (no 3 collinear points); taps 8-15 mirror taps 7-0 through center.
+    //  . . Q . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (2.0, 0.0)/16
+    //  . . . . . . . . . Q . . . . . .  : off =(-7.5, -7.5)/16 + (9.0, 1.0)/16
+    //  . . . . . . . . . . . . Q . . .  : off =(-7.5, -7.5)/16 + (12.0, 2.0)/16
+    //  . . . . Q . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (4.0, 3.0)/16
+    //  . . . . . . . . Q . . . . . . .  : off =(-7.5, -7.5)/16 + (8.0, 4.0)/16
+    //  . . . . . . . . . . . . . . Q .  : off =(-7.5, -7.5)/16 + (14.0, 5.0)/16
+    //  Q . . . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (0.0, 6.0)/16
+    //  . . . . . . . . . . Q . . . . .  : off =(-7.5, -7.5)/16 + (10.0, 7.0)/16
+    //  . . . . . Q . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (5.0, 8.0)/16
+    //  . . . . . . . . . . . . . . . Q  : off =(-7.5, -7.5)/16 + (15.0, 9.0)/16
+    //  . Q . . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (1.0, 10.0)/16
+    //  . . . . . . . Q . . . . . . . .  : off =(-7.5, -7.5)/16 + (7.0, 11.0)/16
+    //  . . . . . . . . . . . Q . . . .  : off =(-7.5, -7.5)/16 + (11.0, 12.0)/16
+    //  . . . Q . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (3.0, 13.0)/16
+    //  . . . . . . Q . . . . . . . . .  : off =(-7.5, -7.5)/16 + (6.0, 14.0)/16
+    //  . . . . . . . . . . . . . Q . .  : off =(-7.5, -7.5)/16 + (13.0, 15.0)/16
+    static const float grid_size = 16.0;
+    assign_aa_cubic_constants();
+    const float4 support_and_importance = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 support_diam = support_and_importance.xy;
+    const float2 axis_importance = support_and_importance.zw;
+    const float2 grid_step = float2(1.0)/grid_size * support_diam;
+    const float2 grid_origin = float2(0.5 - grid_size*0.5) * grid_step;
+    //  Pixel-space positions of the first half of the taps (rows 0-7 above):
+    const float2 pt0 = grid_origin + float2(2.0, 0.0) * grid_step;
+    const float2 pt1 = grid_origin + float2(9.0, 1.0) * grid_step;
+    const float2 pt2 = grid_origin + float2(12.0, 2.0) * grid_step;
+    const float2 pt3 = grid_origin + float2(4.0, 3.0) * grid_step;
+    const float2 pt4 = grid_origin + float2(8.0, 4.0) * grid_step;
+    const float2 pt5 = grid_origin + float2(14.0, 5.0) * grid_step;
+    const float2 pt6 = grid_origin + float2(0.0, 6.0) * grid_step;
+    const float2 pt7 = grid_origin + float2(10.0, 7.0) * grid_step;
+    //  Per-channel weights; each mirrored tap reuses its partner's weight as .bgr.
+    const float3 wt0 = eval_unorm_rgb_weights(pt0, axis_importance);
+    const float3 wt1 = eval_unorm_rgb_weights(pt1, axis_importance);
+    const float3 wt2 = eval_unorm_rgb_weights(pt2, axis_importance);
+    const float3 wt3 = eval_unorm_rgb_weights(pt3, axis_importance);
+    const float3 wt4 = eval_unorm_rgb_weights(pt4, axis_importance);
+    const float3 wt5 = eval_unorm_rgb_weights(pt5, axis_importance);
+    const float3 wt6 = eval_unorm_rgb_weights(pt6, axis_importance);
+    const float3 wt7 = eval_unorm_rgb_weights(pt7, axis_importance);
+    const float3 wt8 = wt7.bgr;
+    const float3 wt9 = wt6.bgr;
+    const float3 wt10 = wt5.bgr;
+    const float3 wt11 = wt4.bgr;
+    const float3 wt12 = wt3.bgr;
+    const float3 wt13 = wt2.bgr;
+    const float3 wt14 = wt1.bgr;
+    const float3 wt15 = wt0.bgr;
+    //  Total weight per channel, inverted so the result can be normalized:
+    const float3 first_half_total = wt0 + wt1 + wt2 + wt3 + wt4 + wt5 + wt6 + wt7;
+    const float3 weight_total = first_half_total + first_half_total.bgr;
+    const float3 weight_norm = float3(1.0)/weight_total;
+    //  Fold the antialiasing pixel diameter into the pixel->uv matrix.
+    const float2x2 scaled_pixel_to_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Convert tap positions to uv offsets, applying the sign pair returned by
+    //  get_frame_sign(frame) to each position first:
+    const float2 tap_sign = get_frame_sign(frame);
+    const float2 duv0 = mul(scaled_pixel_to_uv, pt0 * tap_sign);
+    const float2 duv1 = mul(scaled_pixel_to_uv, pt1 * tap_sign);
+    const float2 duv2 = mul(scaled_pixel_to_uv, pt2 * tap_sign);
+    const float2 duv3 = mul(scaled_pixel_to_uv, pt3 * tap_sign);
+    const float2 duv4 = mul(scaled_pixel_to_uv, pt4 * tap_sign);
+    const float2 duv5 = mul(scaled_pixel_to_uv, pt5 * tap_sign);
+    const float2 duv6 = mul(scaled_pixel_to_uv, pt6 * tap_sign);
+    const float2 duv7 = mul(scaled_pixel_to_uv, pt7 * tap_sign);
+    //  Fetch every tap; the second half uses the negated offsets in reverse:
+    const float3 tap0 = tex2Daa_tiled_linearize(tex, tex_uv + duv0).rgb;
+    const float3 tap1 = tex2Daa_tiled_linearize(tex, tex_uv + duv1).rgb;
+    const float3 tap2 = tex2Daa_tiled_linearize(tex, tex_uv + duv2).rgb;
+    const float3 tap3 = tex2Daa_tiled_linearize(tex, tex_uv + duv3).rgb;
+    const float3 tap4 = tex2Daa_tiled_linearize(tex, tex_uv + duv4).rgb;
+    const float3 tap5 = tex2Daa_tiled_linearize(tex, tex_uv + duv5).rgb;
+    const float3 tap6 = tex2Daa_tiled_linearize(tex, tex_uv + duv6).rgb;
+    const float3 tap7 = tex2Daa_tiled_linearize(tex, tex_uv + duv7).rgb;
+    const float3 tap8 = tex2Daa_tiled_linearize(tex, tex_uv - duv7).rgb;
+    const float3 tap9 = tex2Daa_tiled_linearize(tex, tex_uv - duv6).rgb;
+    const float3 tap10 = tex2Daa_tiled_linearize(tex, tex_uv - duv5).rgb;
+    const float3 tap11 = tex2Daa_tiled_linearize(tex, tex_uv - duv4).rgb;
+    const float3 tap12 = tex2Daa_tiled_linearize(tex, tex_uv - duv3).rgb;
+    const float3 tap13 = tex2Daa_tiled_linearize(tex, tex_uv - duv2).rgb;
+    const float3 tap14 = tex2Daa_tiled_linearize(tex, tex_uv - duv1).rgb;
+    const float3 tap15 = tex2Daa_tiled_linearize(tex, tex_uv - duv0).rgb;
+    //  Weighted sum of all taps, scaled so each channel's weights total 1.0:
+    return weight_norm * (
+        wt0 * tap0 + wt1 * tap1 + wt2 * tap2 + wt3 * tap3 +
+        wt4 * tap4 + wt5 * tap5 + wt6 * tap6 + wt7 * tap7 +
+        wt8 * tap8 + wt9 * tap9 + wt10 * tap10 + wt11 * tap11 +
+        wt12 * tap12 + wt13 * tap13 + wt14 * tap14 + wt15 * tap15);
+}
+
+float3 tex2Daa20x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  20-tap antialiased lookup on the 20-superqueens layout drawn below (no 3
+    //  collinear points, squared attack radius 13); taps 10-19 mirror taps 9-0.
+    //  . . . . . . . Q . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (7.0, 0.0)/20
+    //  . . . . . . . . . . . . . . . . Q . . .  : off =(-9.5, -9.5)/20 + (16.0, 1.0)/20
+    //  . . . . . . . . . . . Q . . . . . . . .  : off =(-9.5, -9.5)/20 + (11.0, 2.0)/20
+    //  . Q . . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (1.0, 3.0)/20
+    //  . . . . . Q . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (5.0, 4.0)/20
+    //  . . . . . . . . . . . . . . . Q . . . .  : off =(-9.5, -9.5)/20 + (15.0, 5.0)/20
+    //  . . . . . . . . . . Q . . . . . . . . .  : off =(-9.5, -9.5)/20 + (10.0, 6.0)/20
+    //  . . . . . . . . . . . . . . . . . . . Q  : off =(-9.5, -9.5)/20 + (19.0, 7.0)/20
+    //  . . Q . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (2.0, 8.0)/20
+    //  . . . . . . Q . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (6.0, 9.0)/20
+    //  . . . . . . . . . . . . . Q . . . . . .  : off =(-9.5, -9.5)/20 + (13.0, 10.0)/20
+    //  . . . . . . . . . . . . . . . . . Q . .  : off =(-9.5, -9.5)/20 + (17.0, 11.0)/20
+    //  Q . . . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (0.0, 12.0)/20
+    //  . . . . . . . . . Q . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (9.0, 13.0)/20
+    //  . . . . Q . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (4.0, 14.0)/20
+    //  . . . . . . . . . . . . . . Q . . . . .  : off =(-9.5, -9.5)/20 + (14.0, 15.0)/20
+    //  . . . . . . . . . . . . . . . . . . Q .  : off =(-9.5, -9.5)/20 + (18.0, 16.0)/20
+    //  . . . . . . . . Q . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (8.0, 17.0)/20
+    //  . . . Q . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (3.0, 18.0)/20
+    //  . . . . . . . . . . . . Q . . . . . . .  : off =(-9.5, -9.5)/20 + (12.0, 19.0)/20
+    static const float grid_size = 20.0;
+    assign_aa_cubic_constants();
+    const float4 support_and_importance = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 support_diam = support_and_importance.xy;
+    const float2 axis_importance = support_and_importance.zw;
+    const float2 grid_step = float2(1.0)/grid_size * support_diam;
+    const float2 grid_origin = float2(0.5 - grid_size*0.5) * grid_step;
+    //  Pixel-space positions of the first half of the taps (rows 0-9 above):
+    const float2 pt0 = grid_origin + float2(7.0, 0.0) * grid_step;
+    const float2 pt1 = grid_origin + float2(16.0, 1.0) * grid_step;
+    const float2 pt2 = grid_origin + float2(11.0, 2.0) * grid_step;
+    const float2 pt3 = grid_origin + float2(1.0, 3.0) * grid_step;
+    const float2 pt4 = grid_origin + float2(5.0, 4.0) * grid_step;
+    const float2 pt5 = grid_origin + float2(15.0, 5.0) * grid_step;
+    const float2 pt6 = grid_origin + float2(10.0, 6.0) * grid_step;
+    const float2 pt7 = grid_origin + float2(19.0, 7.0) * grid_step;
+    const float2 pt8 = grid_origin + float2(2.0, 8.0) * grid_step;
+    const float2 pt9 = grid_origin + float2(6.0, 9.0) * grid_step;
+    //  Per-channel weights; each mirrored tap reuses its partner's weight as .bgr.
+    const float3 wt0 = eval_unorm_rgb_weights(pt0, axis_importance);
+    const float3 wt1 = eval_unorm_rgb_weights(pt1, axis_importance);
+    const float3 wt2 = eval_unorm_rgb_weights(pt2, axis_importance);
+    const float3 wt3 = eval_unorm_rgb_weights(pt3, axis_importance);
+    const float3 wt4 = eval_unorm_rgb_weights(pt4, axis_importance);
+    const float3 wt5 = eval_unorm_rgb_weights(pt5, axis_importance);
+    const float3 wt6 = eval_unorm_rgb_weights(pt6, axis_importance);
+    const float3 wt7 = eval_unorm_rgb_weights(pt7, axis_importance);
+    const float3 wt8 = eval_unorm_rgb_weights(pt8, axis_importance);
+    const float3 wt9 = eval_unorm_rgb_weights(pt9, axis_importance);
+    const float3 wt10 = wt9.bgr;
+    const float3 wt11 = wt8.bgr;
+    const float3 wt12 = wt7.bgr;
+    const float3 wt13 = wt6.bgr;
+    const float3 wt14 = wt5.bgr;
+    const float3 wt15 = wt4.bgr;
+    const float3 wt16 = wt3.bgr;
+    const float3 wt17 = wt2.bgr;
+    const float3 wt18 = wt1.bgr;
+    const float3 wt19 = wt0.bgr;
+    //  Total weight per channel, inverted so the result can be normalized:
+    const float3 first_half_total = wt0 + wt1 + wt2 + wt3 + wt4 + wt5 + wt6 + wt7 + wt8 + wt9;
+    const float3 weight_total = first_half_total + first_half_total.bgr;
+    const float3 weight_norm = float3(1.0)/weight_total;
+    //  Fold the antialiasing pixel diameter into the pixel->uv matrix.
+    const float2x2 scaled_pixel_to_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Convert tap positions to uv offsets, applying the sign pair returned by
+    //  get_frame_sign(frame) to each position first:
+    const float2 tap_sign = get_frame_sign(frame);
+    const float2 duv0 = mul(scaled_pixel_to_uv, pt0 * tap_sign);
+    const float2 duv1 = mul(scaled_pixel_to_uv, pt1 * tap_sign);
+    const float2 duv2 = mul(scaled_pixel_to_uv, pt2 * tap_sign);
+    const float2 duv3 = mul(scaled_pixel_to_uv, pt3 * tap_sign);
+    const float2 duv4 = mul(scaled_pixel_to_uv, pt4 * tap_sign);
+    const float2 duv5 = mul(scaled_pixel_to_uv, pt5 * tap_sign);
+    const float2 duv6 = mul(scaled_pixel_to_uv, pt6 * tap_sign);
+    const float2 duv7 = mul(scaled_pixel_to_uv, pt7 * tap_sign);
+    const float2 duv8 = mul(scaled_pixel_to_uv, pt8 * tap_sign);
+    const float2 duv9 = mul(scaled_pixel_to_uv, pt9 * tap_sign);
+    //  Fetch every tap; the second half uses the negated offsets in reverse:
+    const float3 tap0 = tex2Daa_tiled_linearize(tex, tex_uv + duv0).rgb;
+    const float3 tap1 = tex2Daa_tiled_linearize(tex, tex_uv + duv1).rgb;
+    const float3 tap2 = tex2Daa_tiled_linearize(tex, tex_uv + duv2).rgb;
+    const float3 tap3 = tex2Daa_tiled_linearize(tex, tex_uv + duv3).rgb;
+    const float3 tap4 = tex2Daa_tiled_linearize(tex, tex_uv + duv4).rgb;
+    const float3 tap5 = tex2Daa_tiled_linearize(tex, tex_uv + duv5).rgb;
+    const float3 tap6 = tex2Daa_tiled_linearize(tex, tex_uv + duv6).rgb;
+    const float3 tap7 = tex2Daa_tiled_linearize(tex, tex_uv + duv7).rgb;
+    const float3 tap8 = tex2Daa_tiled_linearize(tex, tex_uv + duv8).rgb;
+    const float3 tap9 = tex2Daa_tiled_linearize(tex, tex_uv + duv9).rgb;
+    const float3 tap10 = tex2Daa_tiled_linearize(tex, tex_uv - duv9).rgb;
+    const float3 tap11 = tex2Daa_tiled_linearize(tex, tex_uv - duv8).rgb;
+    const float3 tap12 = tex2Daa_tiled_linearize(tex, tex_uv - duv7).rgb;
+    const float3 tap13 = tex2Daa_tiled_linearize(tex, tex_uv - duv6).rgb;
+    const float3 tap14 = tex2Daa_tiled_linearize(tex, tex_uv - duv5).rgb;
+    const float3 tap15 = tex2Daa_tiled_linearize(tex, tex_uv - duv4).rgb;
+    const float3 tap16 = tex2Daa_tiled_linearize(tex, tex_uv - duv3).rgb;
+    const float3 tap17 = tex2Daa_tiled_linearize(tex, tex_uv - duv2).rgb;
+    const float3 tap18 = tex2Daa_tiled_linearize(tex, tex_uv - duv1).rgb;
+    const float3 tap19 = tex2Daa_tiled_linearize(tex, tex_uv - duv0).rgb;
+    //  Weighted sum of all taps, scaled so each channel's weights total 1.0:
+    return weight_norm * (
+        wt0 * tap0 + wt1 * tap1 + wt2 * tap2 + wt3 * tap3 +
+        wt4 * tap4 + wt5 * tap5 + wt6 * tap6 + wt7 * tap7 +
+        wt8 * tap8 + wt9 * tap9 + wt10 * tap10 + wt11 * tap11 +
+        wt12 * tap12 + wt13 * tap13 + wt14 * tap14 + wt15 * tap15 +
+        wt16 * tap16 + wt17 * tap17 + wt18 * tap18 + wt19 * tap19);
+}
+
+float3 tex2Daa24x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  24-tap antialiased lookup on the 24-superqueens layout drawn below (no 3
+    //  collinear points, squared attack radius 13); taps 12-23 mirror taps 11-0.
+    //  . . . . . . Q . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (6.0, 0.0)/24
+    //  . . . . . . . . . . . . . . . . Q . . . . . . .  : off =(-11.5, -11.5)/24 + (16.0, 1.0)/24
+    //  . . . . . . . . . . Q . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (10.0, 2.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . . Q . .  : off =(-11.5, -11.5)/24 + (21.0, 3.0)/24
+    //  . . . . . Q . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (5.0, 4.0)/24
+    //  . . . . . . . . . . . . . . . Q . . . . . . . .  : off =(-11.5, -11.5)/24 + (15.0, 5.0)/24
+    //  . Q . . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (1.0, 6.0)/24
+    //  . . . . . . . . . . . Q . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (11.0, 7.0)/24
+    //  . . . . . . . . . . . . . . . . . . . Q . . . .  : off =(-11.5, -11.5)/24 + (19.0, 8.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . . . . Q  : off =(-11.5, -11.5)/24 + (23.0, 9.0)/24
+    //  . . . Q . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (3.0, 10.0)/24
+    //  . . . . . . . . . . . . . . Q . . . . . . . . .  : off =(-11.5, -11.5)/24 + (14.0, 11.0)/24
+    //  . . . . . . . . . Q . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (9.0, 12.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . Q . . .  : off =(-11.5, -11.5)/24 + (20.0, 13.0)/24
+    //  Q . . . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (0.0, 14.0)/24
+    //  . . . . Q . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (4.0, 15.0)/24
+    //  . . . . . . . . . . . . Q . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (12.0, 16.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . . . Q .  : off =(-11.5, -11.5)/24 + (22.0, 17.0)/24
+    //  . . . . . . . . Q . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (8.0, 18.0)/24
+    //  . . . . . . . . . . . . . . . . . . Q . . . . .  : off =(-11.5, -11.5)/24 + (18.0, 19.0)/24
+    //  . . Q . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (2.0, 20.0)/24
+    //  . . . . . . . . . . . . . Q . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (13.0, 21.0)/24
+    //  . . . . . . . Q . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (7.0, 22.0)/24
+    //  . . . . . . . . . . . . . . . . . Q . . . . . .  : off =(-11.5, -11.5)/24 + (17.0, 23.0)/24
+    static const float grid_size = 24.0;
+    assign_aa_cubic_constants();
+    const float4 support_and_importance = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 support_diam = support_and_importance.xy;
+    const float2 axis_importance = support_and_importance.zw;
+    const float2 grid_step = float2(1.0)/grid_size * support_diam;
+    const float2 grid_origin = float2(0.5 - grid_size*0.5) * grid_step;
+    //  Pixel-space positions of the first half of the taps (rows 0-11 above):
+    const float2 pt0 = grid_origin + float2(6.0, 0.0) * grid_step;
+    const float2 pt1 = grid_origin + float2(16.0, 1.0) * grid_step;
+    const float2 pt2 = grid_origin + float2(10.0, 2.0) * grid_step;
+    const float2 pt3 = grid_origin + float2(21.0, 3.0) * grid_step;
+    const float2 pt4 = grid_origin + float2(5.0, 4.0) * grid_step;
+    const float2 pt5 = grid_origin + float2(15.0, 5.0) * grid_step;
+    const float2 pt6 = grid_origin + float2(1.0, 6.0) * grid_step;
+    const float2 pt7 = grid_origin + float2(11.0, 7.0) * grid_step;
+    const float2 pt8 = grid_origin + float2(19.0, 8.0) * grid_step;
+    const float2 pt9 = grid_origin + float2(23.0, 9.0) * grid_step;
+    const float2 pt10 = grid_origin + float2(3.0, 10.0) * grid_step;
+    const float2 pt11 = grid_origin + float2(14.0, 11.0) * grid_step;
+    //  Per-channel weights; each mirrored tap reuses its partner's weight as .bgr.
+    const float3 wt0 = eval_unorm_rgb_weights(pt0, axis_importance);
+    const float3 wt1 = eval_unorm_rgb_weights(pt1, axis_importance);
+    const float3 wt2 = eval_unorm_rgb_weights(pt2, axis_importance);
+    const float3 wt3 = eval_unorm_rgb_weights(pt3, axis_importance);
+    const float3 wt4 = eval_unorm_rgb_weights(pt4, axis_importance);
+    const float3 wt5 = eval_unorm_rgb_weights(pt5, axis_importance);
+    const float3 wt6 = eval_unorm_rgb_weights(pt6, axis_importance);
+    const float3 wt7 = eval_unorm_rgb_weights(pt7, axis_importance);
+    const float3 wt8 = eval_unorm_rgb_weights(pt8, axis_importance);
+    const float3 wt9 = eval_unorm_rgb_weights(pt9, axis_importance);
+    const float3 wt10 = eval_unorm_rgb_weights(pt10, axis_importance);
+    const float3 wt11 = eval_unorm_rgb_weights(pt11, axis_importance);
+    const float3 wt12 = wt11.bgr;
+    const float3 wt13 = wt10.bgr;
+    const float3 wt14 = wt9.bgr;
+    const float3 wt15 = wt8.bgr;
+    const float3 wt16 = wt7.bgr;
+    const float3 wt17 = wt6.bgr;
+    const float3 wt18 = wt5.bgr;
+    const float3 wt19 = wt4.bgr;
+    const float3 wt20 = wt3.bgr;
+    const float3 wt21 = wt2.bgr;
+    const float3 wt22 = wt1.bgr;
+    const float3 wt23 = wt0.bgr;
+    //  Total weight per channel, inverted so the result can be normalized:
+    const float3 first_half_total = wt0 + wt1 + wt2 + wt3 + wt4 +
+        wt5 + wt6 + wt7 + wt8 + wt9 + wt10 + wt11;
+    const float3 weight_total = first_half_total + first_half_total.bgr;
+    const float3 weight_norm = float3(1.0)/weight_total;
+    //  Fold the antialiasing pixel diameter into the pixel->uv matrix.
+    const float2x2 scaled_pixel_to_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Convert tap positions to uv offsets, applying the sign pair returned by
+    //  get_frame_sign(frame) to each position first:
+    const float2 tap_sign = get_frame_sign(frame);
+    const float2 duv0 = mul(scaled_pixel_to_uv, pt0 * tap_sign);
+    const float2 duv1 = mul(scaled_pixel_to_uv, pt1 * tap_sign);
+    const float2 duv2 = mul(scaled_pixel_to_uv, pt2 * tap_sign);
+    const float2 duv3 = mul(scaled_pixel_to_uv, pt3 * tap_sign);
+    const float2 duv4 = mul(scaled_pixel_to_uv, pt4 * tap_sign);
+    const float2 duv5 = mul(scaled_pixel_to_uv, pt5 * tap_sign);
+    const float2 duv6 = mul(scaled_pixel_to_uv, pt6 * tap_sign);
+    const float2 duv7 = mul(scaled_pixel_to_uv, pt7 * tap_sign);
+    const float2 duv8 = mul(scaled_pixel_to_uv, pt8 * tap_sign);
+    const float2 duv9 = mul(scaled_pixel_to_uv, pt9 * tap_sign);
+    const float2 duv10 = mul(scaled_pixel_to_uv, pt10 * tap_sign);
+    const float2 duv11 = mul(scaled_pixel_to_uv, pt11 * tap_sign);
+    //  Fetch every tap; the second half uses the negated offsets in reverse:
+    const float3 tap0 = tex2Daa_tiled_linearize(tex, tex_uv + duv0).rgb;
+    const float3 tap1 = tex2Daa_tiled_linearize(tex, tex_uv + duv1).rgb;
+    const float3 tap2 = tex2Daa_tiled_linearize(tex, tex_uv + duv2).rgb;
+    const float3 tap3 = tex2Daa_tiled_linearize(tex, tex_uv + duv3).rgb;
+    const float3 tap4 = tex2Daa_tiled_linearize(tex, tex_uv + duv4).rgb;
+    const float3 tap5 = tex2Daa_tiled_linearize(tex, tex_uv + duv5).rgb;
+    const float3 tap6 = tex2Daa_tiled_linearize(tex, tex_uv + duv6).rgb;
+    const float3 tap7 = tex2Daa_tiled_linearize(tex, tex_uv + duv7).rgb;
+    const float3 tap8 = tex2Daa_tiled_linearize(tex, tex_uv + duv8).rgb;
+    const float3 tap9 = tex2Daa_tiled_linearize(tex, tex_uv + duv9).rgb;
+    const float3 tap10 = tex2Daa_tiled_linearize(tex, tex_uv + duv10).rgb;
+    const float3 tap11 = tex2Daa_tiled_linearize(tex, tex_uv + duv11).rgb;
+    const float3 tap12 = tex2Daa_tiled_linearize(tex, tex_uv - duv11).rgb;
+    const float3 tap13 = tex2Daa_tiled_linearize(tex, tex_uv - duv10).rgb;
+    const float3 tap14 = tex2Daa_tiled_linearize(tex, tex_uv - duv9).rgb;
+    const float3 tap15 = tex2Daa_tiled_linearize(tex, tex_uv - duv8).rgb;
+    const float3 tap16 = tex2Daa_tiled_linearize(tex, tex_uv - duv7).rgb;
+    const float3 tap17 = tex2Daa_tiled_linearize(tex, tex_uv - duv6).rgb;
+    const float3 tap18 = tex2Daa_tiled_linearize(tex, tex_uv - duv5).rgb;
+    const float3 tap19 = tex2Daa_tiled_linearize(tex, tex_uv - duv4).rgb;
+    const float3 tap20 = tex2Daa_tiled_linearize(tex, tex_uv - duv3).rgb;
+    const float3 tap21 = tex2Daa_tiled_linearize(tex, tex_uv - duv2).rgb;
+    const float3 tap22 = tex2Daa_tiled_linearize(tex, tex_uv - duv1).rgb;
+    const float3 tap23 = tex2Daa_tiled_linearize(tex, tex_uv - duv0).rgb;
+    //  Weighted sum of all taps, scaled so each channel's weights total 1.0:
+    return weight_norm * (
+        wt0 * tap0 + wt1 * tap1 + wt2 * tap2 + wt3 * tap3 +
+        wt4 * tap4 + wt5 * tap5 + wt6 * tap6 + wt7 * tap7 +
+        wt8 * tap8 + wt9 * tap9 + wt10 * tap10 + wt11 * tap11 +
+        wt12 * tap12 + wt13 * tap13 + wt14 * tap14 + wt15 * tap15 +
+        wt16 * tap16 + wt17 * tap17 + wt18 * tap18 + wt19 * tap19 +
+        wt20 * tap20 + wt21 * tap21 + wt22 * tap22 + wt23 * tap23);
+}
+
+float3 tex2Daa_debug_16x_regular(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Sample on a regular 4x4 grid (frame is unused).  This is mainly for testing.
+    static const float grid_size = 4.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offset of each sample in the first two rows (samples 0-7):
+    const float2 xy_offset0 = xy_start_offset + float2(0.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(1.0, 0.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(3.0, 0.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(0.0, 1.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(1.0, 1.0) * xy_step;
+    const float2 xy_offset6 = xy_start_offset + float2(2.0, 1.0) * xy_step;
+    const float2 xy_offset7 = xy_start_offset + float2(3.0, 1.0) * xy_step;
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed:
+    //  sample (15 - k) reuses sample k's weight with its r/b channels swapped.
+    //  (We can't exploit vertical or horizontal symmetry due to uncertain
+    //  subpixel offsets.  Rotating xy offsets with the subpixel structure could fix that.)
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const float3 w8 = w7.bgr;
+    const float3 w9 = w6.bgr;
+    const float3 w10 = w5.bgr;
+    const float3 w11 = w4.bgr;
+    const float3 w12 = w3.bgr;
+    const float3 w13 = w2.bgr;
+    const float3 w14 = w1.bgr;
+    const float3 w15 = w0.bgr;
+    //  Get the weight sum to normalize the total to 1.0 later (w8-w15 add half_sum.bgr):
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets, taking advantage of row alignment:
+    const float2 uv_step_x = mul(true_pixel_to_tex_uv, float2(xy_step.x, 0.0));
+    const float2 uv_step_y = mul(true_pixel_to_tex_uv, float2(0.0, xy_step.y));
+    const float2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y);  //  -1.5 == 0.5 - grid_size*0.5
+    const float2 sample0_uv = tex_uv + uv_offset0;
+    const float2 sample4_uv = sample0_uv + uv_step_y;
+    const float2 sample8_uv = sample0_uv + uv_step_y * 2.0;
+    const float2 sample12_uv = sample0_uv + uv_step_y * 3.0;
+    //  Load samples, linearizing if necessary, etc. (index = row*4 + column):
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, sample0_uv).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 2.0).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 3.0).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, sample4_uv).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 2.0).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 3.0).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, sample8_uv).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 2.0).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 3.0).rgb;
+    const float3 sample12 = tex2Daa_tiled_linearize(tex, sample12_uv).rgb;
+    const float3 sample13 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x).rgb;
+    const float3 sample14 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 2.0).rgb;
+    const float3 sample15 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 3.0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15);
+}
+
+float3 tex2Daa_debug_dynamic(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Testing-only path: an NxN regular grid whose weights are computed here
+    //  on every call instead of being unrolled by hand (frame is unused).
+    static const int grid_dim = 8;
+    assign_aa_cubic_constants();
+    const float4 support_and_importance =
+        get_subpixel_support_diam_and_final_axis_importance();
+    const float2 support_diam = support_and_importance.xy;
+    const float2 axis_importance = support_and_importance.zw;
+    const float half_span = (float(grid_dim) - 1.0)/2.0;
+    const float2 fs_step = support_diam/float2(grid_dim);
+    const float2 fs_origin = -half_span * fs_step;
+    //  Pass 1: evaluate and store every sample's unnormalized rgb weight, and
+    //  accumulate the per-channel total used for normalization at the end:
+    float3 sample_weights[64]; //originally grid_size * grid_size
+    float3 total_weight = float3(0.0, 0.0, 0.0);
+    for(int row = 0; row < grid_dim; ++row)
+    {
+        for(int col = 0; col < grid_dim; ++col)
+        {
+            //  Filter-space position of this sample (col -> x, row -> y):
+            const float2 fs_pos = fs_origin + float2(col, row) * fs_step;
+            const float3 w = eval_unorm_rgb_weights(fs_pos, axis_importance);
+            sample_weights[row*grid_dim + col] = w;
+            total_weight += w;
+        }
+    }
+    //  Pixel-space to texture-uv matrix, scaled by the pixel diameter:
+    const float2x2 scaled_pixel_to_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  uv deltas between samples adjacent along x and along y:
+    const float2 uv_step_x = mul(scaled_pixel_to_uv,
+        float2(fs_step.x, 0.0));
+    const float2 uv_step_y = mul(scaled_pixel_to_uv,
+        float2(0.0, fs_step.y));
+    //  uv of the grid's first sample (row 0, col 0):
+    const float2 first_uv = tex_uv +
+        (-half_span * (uv_step_x + uv_step_y));
+    //  Pass 2: fetch [linearized] samples row by row, weighting each one by
+    //  the value stored for the same (row, col) slot in pass 1:
+    float3 weighted_total = float3(0.0, 0.0, 0.0);
+    for(int row = 0; row < grid_dim; ++row)
+    {
+        const float2 row_start_uv = first_uv + row * uv_step_y;
+        for(int col = 0; col < grid_dim; ++col)
+        {
+            const float2 tap_uv = row_start_uv + col * uv_step_x;
+            const float3 tap = tex2Daa_tiled_linearize(tex, tap_uv).rgb;
+            weighted_total += sample_weights[row*grid_dim + col] * tap;
+        }
+    }
+    //  Normalize so the effective weights sum to 1.0 in each channel:
+    const float3 total_weight_inv = float3(1.0)/total_weight;
+    return weighted_total * total_weight_inv;
+}
+
+
+///////////////////////  ANTIALIASING CODEPATH SELECTION  //////////////////////
+
+inline float3 tex2Daa(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+#ifdef DEBUG
+	return tex2Daa_subpixel_weights_only(  //  DEBUG: always this path; aa_level is ignored
+            tex, tex_uv, pixel_to_tex_uv);
+#else
+	//  Statically switch between antialiasing modes/levels (thresholds sit at N.5):
+    return (aa_level < 0.5) ? tex2D_linearize(tex, tex_uv).rgb :  //  plain linearized fetch
+        (aa_level < 3.5) ? tex2Daa_subpixel_weights_only(  //  [0.5, 3.5)
+            tex, tex_uv, pixel_to_tex_uv) :
+        (aa_level < 4.5) ? tex2Daa4x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 5.5) ? tex2Daa5x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 6.5) ? tex2Daa6x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 7.5) ? tex2Daa7x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 11.5) ? tex2Daa8x(tex, tex_uv, pixel_to_tex_uv, frame) :  //  [7.5, 11.5)
+        (aa_level < 15.5) ? tex2Daa12x(tex, tex_uv, pixel_to_tex_uv, frame) :  //  [11.5, 15.5)
+        (aa_level < 19.5) ? tex2Daa16x(tex, tex_uv, pixel_to_tex_uv, frame) :  //  [15.5, 19.5)
+        (aa_level < 23.5) ? tex2Daa20x(tex, tex_uv, pixel_to_tex_uv, frame) :  //  [19.5, 23.5)
+        (aa_level < 253.5) ? tex2Daa24x(tex, tex_uv, pixel_to_tex_uv, frame) :  //  [23.5, 253.5)
+        (aa_level < 254.5) ? tex2Daa_debug_16x_regular(  //  [253.5, 254.5): regular 4x4 test grid
+            tex, tex_uv, pixel_to_tex_uv, frame) :
+        tex2Daa_debug_dynamic(tex, tex_uv, pixel_to_tex_uv, frame);  //  everything else
+#endif
+}
+
+
+#endif  //  TEX2DANTIALIAS_H
+
+/////////////////////////  END TEX2DANTIALIAS  /////////////////////////
+
+//#include "geometry-functions.h"
+
+/////////////////////////  BEGIN GEOMETRY-FUNCTIONS  /////////////////////////
+
+#ifndef GEOMETRY_FUNCTIONS_H
+#define GEOMETRY_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+// already included elsewhere
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+//#include "bind-shader-h"
+
+
+////////////////////////////  MACROS AND CONSTANTS  ////////////////////////////
+
+//  Curvature-related constants (array capacity for the eye-position point clouds):
+#define MAX_POINT_CLOUD_SIZE 9
+
+
+/////////////////////////////  CURVATURE FUNCTIONS /////////////////////////////
+
+float2 quadratic_solve(const float a, const float b_over_2, const float c)
+{
+    //  Requires:   1.) a, b, and c are quadratic formula coefficients, with b
+    //                  passed pre-halved as b_over_2 = b/2.0
+    //              2.) b_over_2 must be guaranteed < 0.0 (avoids a branch)
+    //  Returns:    float2(first_solution, discriminant); the discriminant lets
+    //              the caller choose how to handle the "no intersection" case.
+    const float disc = b_over_2*b_over_2 - a*c;
+    const float denom = sqrt(disc) - b_over_2;  //  Kahan/Citardauq form
+    const float root0 = c/denom;
+    return float2(root0, disc);
+}
+
+float2 intersect_sphere(const float3 view_vec, const float3 eye_pos_vec)
+{
+    //  Requires:   1.) view_vec and eye_pos_vec are 3D vectors in the sphere's
+    //                  local coordinate frame (eye_pos_vec is a position, i.e.
+    //                  a vector from the origin to the eye/camera)
+    //              2.) geom_radius is a global containing the sphere's radius
+    //  Returns:    float2(distance, discriminant) from quadratic_solve() for a
+    //              ray cast from eye_pos_vec along view_vec at an origin-
+    //              centered sphere of radius geom_radius; the distance to the
+    //              first intersection is in units of length(view_vec).
+    //              http://wiki.cgsociety.org/index.php/Ray_Sphere_Intersection
+    const float quad_c = dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius;
+    const float quad_half_b = dot(view_vec, eye_pos_vec);  //  b with 2.0 factored out
+    const float quad_a = dot(view_vec, view_vec);
+    return quadratic_solve(quad_a, quad_half_b, quad_c);
+}
+
+float2 intersect_cylinder(const float3 view_vec, const float3 eye_pos_vec)
+{
+    //  Requires:   1.) view_vec and eye_pos_vec are 3D vectors in the
+    //                  cylinder's local coordinate frame (eye_pos_vec is a
+    //                  position, i.e. a vector from the origin to the eye)
+    //              2.) geom_radius is a global containing the cylinder's radius
+    //  Returns:    float2(distance, discriminant) from quadratic_solve() for a
+    //              ray of direction view_vec cast from eye_pos_vec at an
+    //              infinite cylinder of radius geom_radius around the y axis;
+    //              distance is in units of length(view_vec).  Coefficients:
+    //              Christer Ericson's Real-Time Collision Detection, p. 195-196,
+    //              reduced with Lagrange's identity and a unit-length axis.
+    //  Arbitrary reference point on the axis, geom_radius along +y:
+    const float3 axis_dir = float3(0.0, 1.0, 0.0);
+    const float3 axis_ref_point = float3(0.0, geom_radius, 0.0);
+    const float3 ref_to_eye = eye_pos_vec - axis_ref_point;
+    const float3 axis_cross_view = cross(axis_dir, view_vec);
+    const float3 axis_cross_ref_to_eye = cross(axis_dir, ref_to_eye);
+    //  Quadratic coefficients, with the factor of 2.0 in b factored out:
+    const float quad_a = dot(axis_cross_view, axis_cross_view);
+    const float quad_half_b = dot(axis_cross_ref_to_eye, axis_cross_view);
+    const float quad_c = dot(axis_cross_ref_to_eye, axis_cross_ref_to_eye) -
+        geom_radius*geom_radius;  //  no dot(axis, axis) factor: it equals 1.0
+    return quadratic_solve(quad_a, quad_half_b, quad_c);
+}
+
+float2 cylinder_xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect)
+{
+    //  Requires:   An xyz intersection position on a cylinder.
+    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
+    //  Mapping:    square_uv.x is the signed arc length in xz-space, and
+    //              square_uv.y = -intersection_pos_local.y (+v = -y).
+    //  atan2 on the (x, z) components gives the signed angle away from the
+    //  image center, which keeps the arc length numerically robust:
+    const float center_angle = atan2(intersection_pos_local.x,
+        intersection_pos_local.z);
+    const float arc_len_signed = center_angle * geom_radius;
+    //  [-0.5, 0.5] in square_uv spans a "square" area, so divide by the
+    //  aspect ratio to stretch the mapping appropriately:
+    const float2 square_uv = float2(arc_len_signed, -intersection_pos_local.y);
+    return square_uv / geom_aspect;
+}
+
+float3 cylinder_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
+{
+    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
+    //  Returns:    An xyz intersection position on a cylinder.  This is the
+    //              inverse of cylinder_xyz_to_uv().
+    //  Scale video_uv by the aspect ratio to recover proportionate x/y
+    //  lengths; the x length is then an arc length around the cylinder.
+    const float2 square_uv = video_uv * geom_aspect;
+    const float center_angle = square_uv.x / geom_radius;
+    //  Place the point on the circle of radius geom_radius in the xz-plane.
+    //  Or: z = sqrt(geom_radius**2 - x**2)
+    //  Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle)
+    const float pos_x = sin(center_angle) * geom_radius;
+    const float pos_z = cos(center_angle) * geom_radius;
+    //  +v corresponds to -y in the local frame, so negate the y length:
+    return float3(pos_x, -square_uv.y, pos_z);
+}
+
+float2 sphere_xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect)
+{
+    //  Requires:   An xyz intersection position on a sphere.
+    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
+    //  Mapping:    First define square_uv.x/square_uv.y ==
+    //              intersection_pos_local.x/intersection_pos_local.y.  Then,
+    //              length(square_uv) is the arc length from the image center
+    //              at (0.0, 0.0, geom_radius) along the tangent great circle.
+    //              Credit for this mapping goes to cgwg: I never managed to
+    //              understand his code, but he told me his mapping was based on
+    //              great circle distances when I asked him about it, which
+    //              informed this very similar (almost identical) mapping.
+    //  Start with a numerically robust arc length calculation between the ray-
+    //  sphere intersection point and the image center using a method posted by
+    //  Roger Stafford on comp.soft-sys.matlab (angle = atan2(|a x b|, a . b)):
+    //  https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ
+    const float3 image_center_pos_local = float3(0.0, 0.0, geom_radius);
+    const float cp_len =
+        length(cross(intersection_pos_local, image_center_pos_local));
+    const float dp = dot(intersection_pos_local, image_center_pos_local);
+    const float angle_from_image_center = atan2(cp_len, dp);
+    const float arc_len = angle_from_image_center * geom_radius;
+    //  Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide
+    //  by the aspect ratio to stretch the mapping appropriately:
+    const float2 square_uv_unit = normalize(float2(intersection_pos_local.x,
+        -intersection_pos_local.y));  //  NOTE(review): zero vector when x == y == 0.0 -- confirm
+    const float2 square_uv = arc_len * square_uv_unit;
+    const float2 video_uv = square_uv / geom_aspect;
+    return video_uv;
+}
+
+float3 sphere_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
+{
+    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
+    //  Returns:    An xyz intersection position on a sphere.  This is the
+    //              inverse of sphere_xyz_to_uv().
+    //  Expand video_uv by the aspect ratio to get proportionate x/y lengths,
+    //  then calculate an xyz position for the spherical mapping above.
+    const float2 square_uv = video_uv * geom_aspect;
+    //  Using length or sqrt here butchers the framerate on my 8800GTS if
+    //  this function is called too many times, and so does taking the max
+    //  component of square_uv/square_uv_unit (program length threshold?).
+    //float arc_len = length(square_uv);
+    const float2 square_uv_unit = normalize(square_uv);  //  NOTE(review): zero vector at uv == (0, 0) -- confirm
+    const float arc_len = square_uv.y/square_uv_unit.y;  //  NOTE(review): 0.0/0.0 when square_uv.y == 0.0 -- confirm
+    const float angle_from_image_center = arc_len / geom_radius;
+    const float xy_dist_from_sphere_center =
+        sin(angle_from_image_center) * geom_radius;
+    //float2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len));
+    const float2 xy_pos = xy_dist_from_sphere_center * square_uv_unit;
+    const float z_pos = cos(angle_from_image_center) * geom_radius;
+    const float3 intersection_pos_local = float3(xy_pos.x, -xy_pos.y, z_pos);  //  +v = -y
+    return intersection_pos_local;
+}
+
+float2 sphere_alt_xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect)
+{
+    //  Requires:   An xyz intersection position on a sphere.
+    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
+    //  Mapping:    square_uv.x is the signed arc length in xz-space, and
+    //              square_uv.y is the signed arc length in yz-space (with y
+    //              negated first).  Same idea as cylinder_xyz_to_uv(), but
+    //              applied to both axes at once with a vectorized atan2.
+    const float2 flipped_xy = float2(intersection_pos_local.x,
+        -intersection_pos_local.y);
+    const float2 center_angles = atan2(flipped_xy, intersection_pos_local.zz);
+    const float2 arc_lens_signed = center_angles * geom_radius;
+    return arc_lens_signed / geom_aspect;
+}
+
+float3 sphere_alt_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
+{
+    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
+    //  Returns:    An xyz intersection position on a sphere.  This is the
+    //              inverse of sphere_alt_xyz_to_uv().
+    //  Each aspect-scaled uv component is an arc length; convert both to
+    //  angles, take their sines for x/y, and solve the sphere equation for z.
+    const float2 center_angles = (video_uv * geom_aspect) / geom_radius;
+    const float2 pos_xy = sin(center_angles) * geom_radius;
+    const float radius_sq = geom_radius*geom_radius;
+    const float pos_z = sqrt(radius_sq - dot(pos_xy, pos_xy));
+    return float3(pos_xy.x, -pos_xy.y, pos_z);
+}
+
+inline float2 intersect(const float3 view_vec_local, const float3 eye_pos_local,
+    const float geom_mode)
+{
+    if(geom_mode < 2.5) return intersect_sphere(view_vec_local, eye_pos_local);
+    return intersect_cylinder(view_vec_local, eye_pos_local);  //  all other modes
+}
+
+inline float2 xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect, const float geom_mode)
+{
+    if(geom_mode < 1.5)  //  spherical mapping
+        return sphere_xyz_to_uv(intersection_pos_local, geom_aspect);
+    if(geom_mode < 2.5)  //  alternate spherical mapping
+        return sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect);
+    return cylinder_xyz_to_uv(intersection_pos_local, geom_aspect);  //  otherwise cylindrical
+}
+
+inline float3 uv_to_xyz(const float2 uv, const float2 geom_aspect,
+    const float geom_mode)
+{
+    if(geom_mode < 1.5) return sphere_uv_to_xyz(uv, geom_aspect);
+    if(geom_mode < 2.5) return sphere_alt_uv_to_xyz(uv, geom_aspect);
+    return cylinder_uv_to_xyz(uv, geom_aspect);  //  all remaining modes
+}
+
+float2 view_vec_to_uv(const float3 view_vec_local, const float3 eye_pos_local,
+    const float2 geom_aspect, const float geom_mode, out float3 intersection_pos)
+{
+    //  Cast the view ray (already in the primitive's local frame) at the
+    //  primitive: hit.x = distance along view_vec_local, hit.y = discriminant.
+    const float2 hit = intersect(view_vec_local, eye_pos_local, geom_mode);
+    //  The position is computed and written to the output parameter
+    //  unconditionally, i.e. before the discriminant is ever inspected:
+    const float3 hit_pos_local = eye_pos_local + view_vec_local * hit.x;
+    intersection_pos = hit_pos_local;
+    //  Map to uv coords only when the discriminant exceeds the 0.005
+    //  threshold; otherwise hand back out-of-range coords for a missed ray:
+    if(hit.y > 0.005)
+        return xyz_to_uv(hit_pos_local, geom_aspect, geom_mode);
+    return float2(1.0);
+}
+
+float3 get_ideal_global_eye_pos_for_points(float3 eye_pos,
+    const float2 geom_aspect, const float3 global_coords[MAX_POINT_CLOUD_SIZE],
+    const int num_points)
+{
+    //  Requires:   Parameters:
+    //              1.) Starting eye_pos is a global 3D position at which the
+    //                  camera contains all points in global_coords[] in its FOV
+    //              2.) geom_aspect = get_aspect_vector(
+    //                      output_size.x / output_size.y);
+    //              3.) global_coords is a point cloud containing global xyz
+    //                  coords of extreme points on the simulated CRT screen.
+    //              Globals:
+    //              1.) geom_view_dist must be > 0.0.  It controls the "near
+    //                  plane" used to interpret flat_video_uv as a view
+    //                  vector, which controls the field of view (FOV).
+    //              Eyespace coordinate frame: +x = right, +y = up, +z = back
+    //  Returns:    Return an eye position at which the point cloud spans as
+    //              much of the screen as possible (given the FOV controlled by
+    //              geom_view_dist) without being cropped or sheared.
+    //  Algorithm:
+    //  1.) Move the eye laterally to a point which attempts to maximize the
+    //      amount we can move forward without clipping the CRT screen.
+    //  2.) Move forward by as much as possible without clipping the CRT.
+    //  Get the allowed movement range by solving for the eye_pos offsets
+    //  that result in each point being projected to a screen edge/corner in
+    //  pseudo-normalized device coords (where xy ranges from [-0.5, 0.5]
+    //  and z = eyespace z):
+    //      pndc_coord = float3(float2(eyespace_xyz.x, -eyespace_xyz.y)*
+    //      geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z);
+    //  Notes:
+    //  The field of view is controlled by geom_view_dist's magnitude relative to
+    //  the view vector's x and y components:
+    //      view_vec.xy ranges from [-0.5, 0.5] * geom_aspect
+    //      view_vec.z = -geom_view_dist
+    //  But for the purposes of perspective divide, it should be considered:
+    //      view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist
+    //      view_vec.z = -1.0
+    static const int max_centering_iters = 1;  //  Keep for easy testing.
+    for(int iter = 0; iter < max_centering_iters; iter++)
+    {
+        //  0.) Get the eyespace coordinates of our point cloud:
+        float3 eyespace_coords[MAX_POINT_CLOUD_SIZE];
+        for(int i = 0; i < num_points; i++)
+        {
+            eyespace_coords[i] = global_coords[i] - eye_pos;
+        }
+        //  1a.)For each point, find out how far we can move eye_pos in each
+        //      lateral direction without the point clipping the frustum.
+        //      Eyespace +y = up, screenspace +y = down, so flip y after
+        //      applying the eyespace offset (on the way to "clip space").
+        //  Solve for two offsets per point based on:
+        //      (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) *
+        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(-0.5)
+        //      (eyespace_xyz.xy - offset_ul) * float2(1.0, -1.0) *
+        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(0.5)
+        //  offset_ul and offset_dr represent the farthest we can move the
+        //  eye_pos up-left and down-right.  Save the min of all offset_dr's
+        //  and the max of all offset_ul's (since it's negative).
+        float abs_radius = abs(geom_radius);  //  In case anyone gets ideas. ;)
+        float2 offset_dr_min = float2(10.0 * abs_radius, 10.0 * abs_radius);
+        float2 offset_ul_max = float2(-10.0 * abs_radius, -10.0 * abs_radius);
+        for(int i = 0; i < num_points; i++)
+        {
+            static const float2 flipy = float2(1.0, -1.0);
+            float3 eyespace_xyz = eyespace_coords[i];
+            float2 offset_dr = eyespace_xyz.xy - float2(-0.5) *
+                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
+            float2 offset_ul = eyespace_xyz.xy - float2(0.5) *
+                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
+            offset_dr_min = min(offset_dr_min, offset_dr);
+            offset_ul_max = max(offset_ul_max, offset_ul);
+        }
+        //  1b.)Update eye_pos: Adding the average of offset_ul_max and
+        //      offset_dr_min gives it equal leeway on the top vs. bottom
+        //      and left vs. right.  Recalculate eyespace_coords accordingly.
+        float2 center_offset = 0.5 * (offset_ul_max + offset_dr_min);
+        eye_pos.xy += center_offset;
+        for(int i = 0; i < num_points; i++)
+        {
+            eyespace_coords[i] = global_coords[i] - eye_pos;
+        }
+        //  2a.)For each point, find out how far we can move eye_pos forward
+        //      without the point clipping the frustum.  Flip the y
+        //      direction in advance (matters for a later step, not here).
+        //      Solve for four offsets per point based on:
+        //      eyespace_xyz_flipy.x * geom_view_dist /
+        //          (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) =-0.5
+        //      eyespace_xyz_flipy.y * geom_view_dist /
+        //          (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) =-0.5
+        //      eyespace_xyz_flipy.x * geom_view_dist /
+        //          (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) = 0.5
+        //      eyespace_xyz_flipy.y * geom_view_dist /
+        //          (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) = 0.5
+        //      We'll vectorize the actual computation.  Take the maximum of
+        //      these four for a single offset, and continue taking the max
+        //      for every point (use max because offset.z is negative).
+        float offset_z_max = -10.0 * geom_radius * geom_view_dist;  //  NOTE(review): geom_radius, not abs_radius -- confirm
+        for(int i = 0; i < num_points; i++)
+        {
+            float3 eyespace_xyz_flipy = eyespace_coords[i] *
+                float3(1.0, -1.0, 1.0);
+            float4 offset_zzzz = eyespace_xyz_flipy.zzzz +
+                (eyespace_xyz_flipy.xyxy * geom_view_dist) /
+                (float4(-0.5, -0.5, 0.5, 0.5) * float4(geom_aspect, geom_aspect));
+            //  Ignore offsets that push positive x/y values to opposite
+            //  boundaries, and vice versa, and don't let the camera move
+            //  past a point in the dead center of the screen:
+            offset_z_max = (eyespace_xyz_flipy.x < 0.0) ?
+                max(offset_z_max, offset_zzzz.x) : offset_z_max;
+            offset_z_max = (eyespace_xyz_flipy.y < 0.0) ?
+                max(offset_z_max, offset_zzzz.y) : offset_z_max;
+            offset_z_max = (eyespace_xyz_flipy.x > 0.0) ?
+                max(offset_z_max, offset_zzzz.z) : offset_z_max;
+            offset_z_max = (eyespace_xyz_flipy.y > 0.0) ?
+                max(offset_z_max, offset_zzzz.w) : offset_z_max;
+            offset_z_max = max(offset_z_max, eyespace_xyz_flipy.z);
+        }
+        //  2b.)Update eye_pos: Add the maximum (smallest negative) z offset.
+        eye_pos.z += offset_z_max;
+    }
+    return eye_pos;
+}
+
+float3 get_ideal_global_eye_pos(const float3x3 local_to_global,
+    const float2 geom_aspect, const float geom_mode)
+{
+    //  Returns a global eye position for viewing the curved screen.  Start with
+    //  an eye_pos that fits the entire primitive (sphere or cylinder) in the FOV:
+    const float3 high_view = float3(0.0, geom_aspect.y, -geom_view_dist);
+    const float3 low_view = high_view * float3(1.0, -1.0, 1.0);
+    const float len_sq = dot(high_view, high_view);
+    const float fov = abs(acos(dot(high_view, low_view)/len_sq));
+    //  Trigonometry/similar triangles say distance = geom_radius/sin(fov/2):
+    const float eye_z_spherical = geom_radius/sin(fov*0.5);
+    const float3 eye_pos = geom_mode < 2.5 ?
+        float3(0.0, 0.0, eye_z_spherical) :
+        float3(0.0, 0.0, max(geom_view_dist, eye_z_spherical));
+
+    //  Get global xyz coords of extreme sample points on the simulated CRT
+    //  screen.  Start with the center, edge centers, and corners of the
+    //  video image.  We can't ignore backfacing points: They're occluded
+    //  by closer points on the primitive, but they may NOT be occluded by
+    //  the convex hull of the remaining samples (i.e. the remaining convex
+    //  hull might not envelope points that do occlude a back-facing point.)
+    static const int num_points = MAX_POINT_CLOUD_SIZE;
+    float3 global_coords[MAX_POINT_CLOUD_SIZE];
+    global_coords[0] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.0), geom_aspect, geom_mode));
+    global_coords[1] = mul(local_to_global, uv_to_xyz(float2(0.0, -0.5), geom_aspect, geom_mode));
+    global_coords[2] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.5), geom_aspect, geom_mode));
+    global_coords[3] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.0), geom_aspect, geom_mode));
+    global_coords[4] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.0), geom_aspect, geom_mode));
+    global_coords[5] = mul(local_to_global, uv_to_xyz(float2(-0.5, -0.5), geom_aspect, geom_mode));
+    global_coords[6] = mul(local_to_global, uv_to_xyz(float2(0.5, -0.5), geom_aspect, geom_mode));
+    global_coords[7] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.5), geom_aspect, geom_mode));
+    global_coords[8] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.5), geom_aspect, geom_mode));
+    //  Adding more inner image points could help in extreme cases, but too many
+    //  points will kill the framerate.  For safety, default to the initial
+    //  eye_pos if any z coords are negative, so test every point (not just [0]):
+    float num_negative_z_coords = 0.0;
+    for(int i = 0; i < num_points; i++)
+    {
+        num_negative_z_coords += float(global_coords[i].z < 0.0);
+    }
+    //  Outsource the optimized eye_pos calculation unless a point failed the test:
+    return num_negative_z_coords > 0.5 ? eye_pos :
+        get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect,
+            global_coords, num_points);
+}
+
+float3x3 get_pixel_to_object_matrix(const float3x3 global_to_local,
+    const float3 eye_pos_local, const float3 view_vec_global,
+    const float3 intersection_pos_local, const float3 normal,
+    const float2 output_size_inv)
+{
+    //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
+    //              descriptions of each parameter.
+    //  Returns:    A matrix taking 2D pixel-space offsets (where
+    //              (+1.0, +1.0) points one pixel down-right, matching uv
+    //              texel direction) to 3D object-space vectors in the CRT's
+    //              right-handed local frame, ***tangent to the CRT surface
+    //              at the intersection position.***  Those surface vectors
+    //              are meant for later conversion to uv vectors.
+    //  Approach:   Intersect the neighboring pixels' view rays with the
+    //              tangent plane instead of the curved primitive.  This is
+    //              piecewise-linear, and faster than finding 3 view-object
+    //              intersections.
+    //  1.) Step the global view vector one pixel to the right and one pixel
+    //      down (the y step is negated), then bring both neighbor rays into
+    //      the CRT's local frame:
+    const float3 ray_right_global =
+        view_vec_global + float3(output_size_inv.x, 0.0, 0.0);
+    const float3 ray_down_global =
+        view_vec_global + float3(0.0, -output_size_inv.y, 0.0);
+    const float3 ray_right =
+        mul(global_to_local, ray_right_global);
+    const float3 ray_down =
+        mul(global_to_local, ray_down_global);
+    //  2.) Intersect each neighbor ray with the tangent plane through the
+    //      true intersection point.  The ray eye + t*ray meets that plane
+    //      at t = dot(hit - eye, normal)/dot(ray, normal):
+    const float plane_offset =
+        dot(intersection_pos_local - eye_pos_local, normal);
+    const float3 hit_right = eye_pos_local +
+        (plane_offset/dot(ray_right, normal)) * ray_right;
+    const float3 hit_down = eye_pos_local +
+        (plane_offset/dot(ray_down, normal)) * ray_down;
+    //  3.) Measured from the true intersection point, the two plane hits
+    //      give object-space vectors tangent to the plane.  They are the
+    //      images of the (1.0, 0.0) and (0.0, 1.0) pixel offsets, so they
+    //      serve as the first two basis vectors of the pixel-space to
+    //      object-space transformation.  That transformation is 2D to 3D,
+    //      so (0, 0, 0) fills in for the third vector.
+    const float3 basis_right = hit_right - intersection_pos_local;
+    const float3 basis_down = hit_down - intersection_pos_local;
+    const float3x3 pixel_to_object = float3x3(
+        basis_right.x, basis_down.x, 0.0,
+        basis_right.y, basis_down.y, 0.0,
+        basis_right.z, basis_down.z, 0.0);
+    return pixel_to_object;
+}
+
+float3x3 get_object_to_tangent_matrix(const float3 intersection_pos_local,
+    const float3 normal, const float2 geom_aspect, const float geom_mode)
+{
+    //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
+    //              descriptions of each parameter.
+    //  Returns:    Return a transformation matrix from 3D object-space vectors
+    //              in the CRT's local coordinate frame (right-handed, +y = up)
+    //              to 2D video_uv vectors (+v = down).
+    //  Description:
+    //  The TBN matrix formed by the [tangent, bitangent, normal] basis
+    //  vectors transforms ordinary vectors from tangent->object space.
+    //  The cotangent matrix formed by the [cotangent, cobitangent, normal]
+    //  basis vectors transforms normal vectors (covectors) from
+    //  tangent->object space.  It's the inverse-transpose of the TBN matrix.
+    //  We want the inverse of the TBN matrix (transpose of the cotangent
+    //  matrix), which transforms ordinary vectors from object->tangent space.
+    //  Start by calculating the relevant basis vectors in accordance with
+    //  Christian Schüler's blog post "Followup: Normal Mapping Without
+    //  Precomputed Tangents":  http://www.thetenthplanet.de/archives/1180
+    //  With our particular uv mapping, the scale of the u and v directions
+    //  is determined entirely by the aspect ratio for cylindrical and ordinary
+    //  spherical mappings, and so tangent and bitangent lengths are also
+    //  determined by it (the alternate mapping is more complex).  Therefore, we
+    //  must ensure appropriate cotangent and cobitangent lengths as well.
+    //  Base these off the uv<=>xyz mappings for each primitive.
+    const float3 pos = intersection_pos_local;
+    static const float3 x_vec = float3(1.0, 0.0, 0.0);
+    static const float3 y_vec = float3(0.0, 1.0, 0.0);
+    //  The tangent and bitangent vectors correspond with increasing u and v,
+    //  respectively.  Mathematically we'd base the cotangent/cobitangent on
+    //  those, but we'll compute the cotangent/cobitangent directly when we can.
+    float3 cotangent_unscaled, cobitangent_unscaled;
+    //  geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE.
+    if(geom_mode < 1.5)
+    {
+        //  Sphere:
+        //  tangent = normalize(cross(normal, cross(x_vec, pos))) * geom_aspect.x
+        //  bitangent = normalize(cross(cross(y_vec, pos), normal)) * geom_aspect.y
+        //  inv_determinant = 1.0/length(cross(bitangent, tangent))
+        //  cotangent = cross(normal, bitangent) * inv_determinant
+        //            == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant
+        //  cobitangent = cross(tangent, normal) * inv_determinant
+        //            == normalize(cross(x_vec, pos)) * geom_aspect.x * inv_determinant
+        //  Simplified (scale by inv_determinant below):
+        cotangent_unscaled = normalize(cross(y_vec, pos)) * geom_aspect.y;
+        cobitangent_unscaled = normalize(cross(x_vec, pos)) * geom_aspect.x;
+    }
+    else if(geom_mode < 2.5)
+    {
+        //  Sphere, alternate mapping:
+        //  This mapping works a bit like the cylindrical mapping in two
+        //  directions, which makes the lengths and directions more complex.
+        //  Unfortunately, I can't find much of a shortcut:
+        const float3 tangent = normalize(
+            cross(y_vec, float3(pos.x, 0.0, pos.z))) * geom_aspect.x;
+        const float3 bitangent = normalize(
+            cross(x_vec, float3(0.0, pos.yz))) * geom_aspect.y;
+        cotangent_unscaled = cross(normal, bitangent);
+        cobitangent_unscaled = cross(tangent, normal);
+    }
+    else
+    {
+        //  Cylinder (simplified like the sphere; scaled by inv_determinant below):
+        //  tangent = normalize(cross(y_vec, normal)) * geom_aspect.x;
+        //  bitangent = float3(0.0, -geom_aspect.y, 0.0);
+        //  inv_determinant = 1.0/length(cross(bitangent, tangent))
+        //  cotangent = cross(normal, bitangent) * inv_determinant
+        //            == cross(y_vec, normal) * geom_aspect.y * inv_determinant
+        //  cobitangent = cross(tangent, normal) * inv_determinant
+        //            == float3(0.0, -geom_aspect.x, 0.0) * inv_determinant
+        cotangent_unscaled = cross(y_vec, normal) * geom_aspect.y;
+        cobitangent_unscaled = float3(0.0, -geom_aspect.x, 0.0);
+    }
+    const float3 computed_normal =
+        cross(cobitangent_unscaled, cotangent_unscaled);
+    const float inv_determinant = rsqrt(dot(computed_normal, computed_normal));  //  == 1.0/length(computed_normal)
+    const float3 cotangent = cotangent_unscaled * inv_determinant;
+    const float3 cobitangent = cobitangent_unscaled * inv_determinant;
+    //  The [cotangent, cobitangent, normal] column vecs form the cotangent
+    //  frame, i.e. the inverse-transpose TBN matrix.  Get its transpose:
+    const float3x3 object_to_tangent = float3x3(cotangent, cobitangent, normal);
+    return object_to_tangent;
+}
+
+float2 get_curved_video_uv_coords_and_tangent_matrix(
+    const float2 flat_video_uv, const float3 eye_pos_local,
+    const float2 output_size_inv, const float2 geom_aspect,
+    const float geom_mode, const float3x3 global_to_local,
+    out float2x2 pixel_to_tangent_video_uv)
+{
+    //  Requires:   Parameters:
+    //              1.) flat_video_uv coords are in range [0.0, 1.0], where
+    //                  (0.0, 0.0) is the top-left corner of the screen and
+    //                  (1.0, 1.0) is the bottom-right corner.
+    //              2.) eye_pos_local is the 3D camera position in the simulated
+    //                  CRT's local coordinate frame.  For best results, it must
+    //                  be computed based on the same geom_view_dist used here.
+    //              3.) output_size_inv = float2(1.0)/output_size
+    //              4.) geom_aspect = get_aspect_vector(
+    //                      output_size.x / output_size.y);
+    //              5.) geom_mode is a static or runtime mode setting:
+    //                  0 = off, 1 = sphere, 2 = sphere alt., 3 = cylinder
+    //              6.) global_to_local is a 3x3 matrix transforming (ordinary)
+    //                  worldspace vectors to the CRT's local coordinate frame
+    //              Globals:
+    //              1.) geom_view_dist must be > 0.0.  It controls the "near
+    //                  plane" used to interpret flat_video_uv as a view
+    //                  vector, which controls the field of view (FOV).
+    //  Returns:    Return final uv coords in [0.0, 1.0], and return a pixel-
+    //              space to video_uv tangent-space matrix in the out parameter.
+    //              (This matrix assumes pixel-space +y = down, like +v = down.)
+    //              We'll transform flat_video_uv into a view vector, project
+    //              the view vector from the camera/eye, intersect with a sphere
+    //              or cylinder representing the simulated CRT, and convert the
+    //              intersection position into final uv coords and a local
+    //              transformation matrix.
+    //  First get the 3D view vector (geom_aspect is a parameter; geom_view_dist is a global):
+    //  1.) Center uv around (0.0, 0.0) and make (-0.5, -0.5) and (0.5, 0.5)
+    //      correspond to the top-left/bottom-right output screen corners.
+    //  2.) Multiply by geom_aspect to preemptively "undo" Retroarch's screen-
+    //      space 2D aspect correction.  We'll reapply it in uv-space.
+    //  3.) (x, y) = (u, -v), because +v is down in 2D screenspace, but +y
+    //      is up in 3D worldspace (enforce a right-handed system).
+    //  4.) The view vector z controls the "near plane" distance and FOV.
+    //      For the effect of "looking through a window" at a CRT, it should be
+    //      set equal to the user's distance from their physical screen, in
+    //      units of the viewport's physical diagonal size.
+    const float2 view_uv = (flat_video_uv - float2(0.5)) * geom_aspect;
+    const float3 view_vec_global =
+        float3(view_uv.x, -view_uv.y, -geom_view_dist);
+    //  Transform the view vector into the CRT's local coordinate frame, convert
+    //  to video_uv coords, and get the local 3D intersection position:
+    const float3 view_vec_local = mul(global_to_local, view_vec_global);
+    float3 pos;
+    const float2 centered_uv = view_vec_to_uv(
+        view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos);
+    const float2 video_uv = centered_uv + float2(0.5);
+    //  Get a pixel-to-tangent-video-uv matrix.  The caller could deal with
+    //  all but one of these cases, but that would be more complicated.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        //  Derivatives obtain a matrix very fast, but the direction of pixel-
+        //  space +y seems to depend on the pass.  Enforce the correct direction
+        //  on a best-effort basis (but it shouldn't matter for antialiasing).
+        const float2 duv_dx = ddx(video_uv);
+        const float2 duv_dy = ddy(video_uv);
+        #ifdef LAST_PASS
+            pixel_to_tangent_video_uv = float2x2(
+                duv_dx.x, duv_dy.x,
+                -duv_dx.y, -duv_dy.y);
+        #else
+            pixel_to_tangent_video_uv = float2x2(
+                duv_dx.x, duv_dy.x,
+                duv_dx.y, duv_dy.y);
+        #endif
+    #else
+        //  Manually define a transformation matrix.  We'll assume pixel-space
+        //  +y = down, just like +v = down.
+        if(geom_force_correct_tangent_matrix)
+        {
+            //  Get the surface normal based on the local intersection position:
+            const float3 normal_base = geom_mode < 2.5 ? pos :
+                float3(pos.x, 0.0, pos.z);
+            const float3 normal = normalize(normal_base);
+            //  Get pixel-to-object and object-to-tangent matrices and combine
+            //  them into a 2x2 pixel-to-tangent matrix for video_uv offsets:
+            const float3x3 pixel_to_object = get_pixel_to_object_matrix(
+                global_to_local, eye_pos_local, view_vec_global, pos, normal,
+                output_size_inv);
+            const float3x3 object_to_tangent = get_object_to_tangent_matrix(
+                pos, normal, geom_aspect, geom_mode);
+            const float3x3 pixel_to_tangent3x3 =
+                mul(object_to_tangent, pixel_to_object);
+            pixel_to_tangent_video_uv = float2x2(
+                pixel_to_tangent3x3[0][0], pixel_to_tangent3x3[0][1], pixel_to_tangent3x3[1][0], pixel_to_tangent3x3[1][1]);//._m00_m01_m10_m11); //TODO/FIXME: needs to correct for column-major??
+        }
+        else
+        {
+            //  Ignore curvature, and just consider flat scaling.  The
+            //  difference is only apparent with strong curvature:
+            pixel_to_tangent_video_uv = float2x2(
+                output_size_inv.x, 0.0, 0.0, output_size_inv.y);
+        }
+    #endif
+    return video_uv;
+}
+
+float get_border_dim_factor(const float2 video_uv, const float2 geom_aspect)
+{
+    //  COPYRIGHT NOTE FOR THIS FUNCTION:
+    //  Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey
+    //  This function uses an algorithm first coded in several of cgwg's GPL-
+    //  licensed lines in crt-geom-curved.cg and its ancestors.  The line
+    //  between algorithm and code is nearly indistinguishable here, so it's
+    //  unclear whether I could even release this project under a non-GPL
+    //  license with this function included.
+
+    //  Darken fragments near the uv-space image borders.  border_size,
+    //  border_darkness, and border_compress come from outside this function:
+    const float2 uv_edge_gap = min(video_uv, float2(1.0) - video_uv);
+    const float2 border_overlap =
+        max(float2(border_size) - uv_edge_gap * geom_aspect, float2(0.0));
+    //  0.0 once both aspect-scaled edge distances reach border_size:
+    const float overlap_frac = length(border_overlap)/border_size;
+    const float escape_frac = max(1.0 - overlap_frac, 0.0);
+    const float dim_scale = max(1.0, border_compress);
+    const float dim_raw = pow(escape_frac, border_darkness) * dim_scale;
+    return min(dim_raw, 1.0);
+}
+
+
+
+#endif  //  GEOMETRY_FUNCTIONS_H
+
+/////////////////////////  END GEOMETRY-FUNCTIONS  /////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+float2x2 mul_scale(float2 scale, float2x2 matrix)
+{
+    //  Stands in for mul(float2x2(scale.x, 0.0, 0.0, scale.y), matrix)
+    //  without a matrix multiply: every element of matrix[0] is scaled by
+    //  scale.x and every element of matrix[1] by scale.y.
+    return float2x2(matrix[0] * scale.x, matrix[1] * scale.y);
+}
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+void main() {
+    //  Localize some parameters:
+    const float2 geom_aspect = geom_aspect_and_overscan.xy;
+    const float2 geom_overscan = geom_aspect_and_overscan.zw;
+    const float2 video_size_inv = video_and_texture_size_inv.xy;
+    const float2 texture_size_inv = video_and_texture_size_inv.zw;
+    //  (output_size_inv is used below under its own name; no local alias.)
+    #ifdef RUNTIME_GEOMETRY_TILT
+        const float3x3 global_to_local = float3x3(global_to_local_row0,
+            global_to_local_row1, global_to_local_row2);
+    #else
+        static const float3x3 global_to_local = geom_global_to_local_static;
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        const float geom_mode = geom_mode_runtime;
+    #else
+        static const float geom_mode = geom_mode_static;
+    #endif
+
+    //  Get flat and curved texture coords for the current fragment point sample
+    //  and a pixel_to_tangent_video_uv matrix for transforming pixel offsets:
+    //  video_uv = relative position in video frame, mapped to [0.0, 1.0] range
+    //  tex_uv = relative position in padded texture, mapped to [0.0, 1.0] range
+    const float2 flat_video_uv = tex_uv * (texture_size * video_size_inv);
+    float2x2 pixel_to_video_uv;
+    float2 video_uv_no_geom_overscan;
+    if(geom_mode > 0.5)
+    {
+        video_uv_no_geom_overscan =
+            get_curved_video_uv_coords_and_tangent_matrix(flat_video_uv,
+                eye_pos_local, output_size_inv, geom_aspect,
+                geom_mode, global_to_local, pixel_to_video_uv);
+    }
+    else
+    {
+        video_uv_no_geom_overscan = flat_video_uv;
+        pixel_to_video_uv = float2x2(
+            output_size_inv.x, 0.0, 0.0, output_size_inv.y);
+    }
+    //  Correct for overscan here (not in curvature code):
+    const float2 video_uv =
+        (video_uv_no_geom_overscan - float2(0.5, 0.5))/geom_overscan + float2(0.5, 0.5);
+    const float2 tex_uv = video_uv * (video_size * texture_size_inv);  //  local from here on; the tex_uv read above is the outer-scope one
+
+    //  Get a matrix transforming pixel vectors to tex_uv vectors:
+    const float2x2 pixel_to_tex_uv =
+        mul_scale(video_size * texture_size_inv /
+            geom_aspect_and_overscan.zw, pixel_to_video_uv);
+
+    //  Sample!  Skip antialiasing if aa_level < 0.5 or both of these hold:
+    //  1.) Geometry/curvature isn't used
+    //  2.) Overscan == float2(1.0, 1.0)
+    //  Skipping AA is sharper, but it's only faster with dynamic branches.
+    const float2 abs_aa_r_offset = abs(get_aa_subpixel_r_offset());
+    const bool need_subpixel_aa = abs_aa_r_offset.x + abs_aa_r_offset.y > 0.0;
+    float3 color;
+    if(aa_level > 0.5 && (geom_mode > 0.5 || any(bool2((geom_overscan.x != 1.0), (geom_overscan.y != 1.0)))))
+    {
+        //  Sample the input with antialiasing (due to sharp phosphors, etc.):
+        color = tex2Daa(input_texture, tex_uv, pixel_to_tex_uv, float(frame_count));
+    }
+
+    else if(aa_level > 0.5 && need_subpixel_aa)
+    {
+        //  Sample at each subpixel location:
+        color = tex2Daa_subpixel_weights_only(
+            input_texture, tex_uv, pixel_to_tex_uv);
+    }
+    else
+    {
+        //  No antialiasing needed: take a single linearized sample.
+        color = tex2D_linearize(input_texture, tex_uv).rgb;
+    }
+
+    //  Dim borders and output the final result:
+    const float border_dim_factor = get_border_dim_factor(video_uv, geom_aspect);
+    const float3 final_color = color * border_dim_factor;
+
+    FragColor = encode_output(float4(final_color, 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/geometry-aa-last-pass.vs b/shaders/CRT-Royale.shader/geometry-aa-last-pass.vs
new file mode 100644
index 00000000..1c99650d
--- /dev/null
+++ b/shaders/CRT-Royale.shader/geometry-aa-last-pass.vs
@@ -0,0 +1,5263 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+in vec4 position;
+in vec2 texCoord;
+
+out Vertex {
+   vec2 vTexCoord;
+   vec2 tex_uv;                       // position in the padded texture, [0.0, 1.0] range (per the fragment pass's comments)
+   vec4 video_and_texture_size_inv;   // fragment pass reads .xy as video_size_inv and .zw as texture_size_inv
+   vec2 output_size_inv;              // documented in the fragment pass as float2(1.0)/output_size
+   vec3 eye_pos_local;                // camera position in the CRT's local frame (as consumed by the fragment pass)
+   vec4 geom_aspect_and_overscan;     // fragment pass reads .xy as geom_aspect and .zw as geom_overscan
+   vec3 global_to_local_row0;         // rows 0-2 are reassembled with float3x3(row0, row1, row2) in the
+   vec3 global_to_local_row1;         // fragment pass when RUNTIME_GEOMETRY_TILT is defined
+   vec3 global_to_local_row2;         // NOTE(review): confirm the values written in this file's main()
+};
+
+uniform vec4 targetSize;
+uniform vec4 sourceSize[];
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms (mul swaps its operands; its arguments are parenthesized so compound expressions expand safely)
+#define mul(a,b) ((b)*(a))
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(y,x) atan(y,x)
+#define rsqrt(c) inversesqrt(c)
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#define LAST_PASS
+#define SIMULATE_CRT_ON_LCD
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5, which is the value set here (an earlier note found 0.5 too dark and suggested 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;        //  range [0, 2]; option 2 (4x4 Gaussian)
+#else
+    static const float bloom_approx_filter_static = 1.0;        //  range [0, 2]; option 1 (3x3 blur) when RADEON_FIX is defined
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;      //  range [0, 1]
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;      //  range [0, 2]
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
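+    //  the blue offset is the negative r offset; range [0, 0.5] (NOTE(review): the static x offset below is -1/3, so this range presumably bounds the magnitude; confirm)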
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. re-add horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5, which is the value set here (an earlier note found 0.5 too dark and suggested 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;        //  range [0, 2]; option 2 (4x4 Gaussian)
+#else
+    static const float bloom_approx_filter_static = 1.0;        //  range [0, 2]; option 1 (3x3 blur) when RADEON_FIX is defined
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;      //  range [0, 1]
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;      //  range [0, 2]
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most AA filters have a base radius of 4.0 pixels.  NOTE(review): the default r offset x is -1.0/3.0, so this sum shrinks the border; confirm whether abs() was intended:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  1: 3x3 resize blur
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE   //  Strip settings this mode can't run
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so 2.0 becomes 0.0 (bilinear); others are kept:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static choice as-is
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD   //  tex2Dlod is active: undefine every other strategy
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD: keep only the first remaining strategy
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS: tile-flat-twice outranks fix-discontinuities
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD   //  Pick which mask LUT size the resize passes read from
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD: use the small LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD: use the small LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY   //  Reserve an antialiasing border: filter radius plus subpixel offset
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;  //  NOTE(review): signed x component (-1.0/3.0 in the static user setting), so it shrinks the border below; confirm a magnitude wasn't intended
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  !GEOMETRY_EARLY: no antialiasing border is reserved
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY  //  Tile count and first sampled texel for the MASK_RESIZE output
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE: one tile plus twice the tile border
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: always one tile plus twice the tile border
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Build the (x, y) aspect ratio vector.  The requested ratio is capped
+    //  at geom_max_aspect_ratio first:
+    const float capped_ratio = min(geom_aspect_ratio, geom_max_aspect_ratio);
+    //  Normalizing to unit length keeps the vector's absolute scale from
+    //  affecting the uv-mapping for curvature:
+    const float2 aspect_unscaled = float2(capped_ratio, 1.0);
+    return normalize(aspect_unscaled);
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Pack the scalar x/y overscan settings into one float2.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Pack the scalar x/y tilt angle settings into one float2.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  Pack the x convergence offsets of the r, g, and b channels into one float3.
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  Pack the y convergence offsets of the r, g, and b channels into one float3.
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Pack the r channel's x/y convergence offsets into one float2.
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Pack the g channel's x/y convergence offsets into one float2.
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Pack the b channel's x/y convergence offsets into one float2.
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{   //  Runtime x/y pair only if both RUNTIME_ANTIALIAS_* macros below are defined; otherwise the static constant.
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #else
+            return aa_subpixel_r_offset_static;
+        #endif
+    #else   //  Both fallback branches return the same static offset.
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Return the reciprocal of the average color of the mask selected by
+    //  mask_type: below 0.5 picks the grille constant, below 1.5 the slot
+    //  constant, and anything else the shadow constant.
+    if(mask_type < 0.5) {   return 1.0/mask_grille_avg_color;   }
+    if(mask_type < 1.5) {   return 1.0/mask_slot_avg_color;     }
+    return 1.0/mask_shadow_avg_color;
+}
+
+inline float get_mask_sample_mode()
+{   //  Without PHOSPHOR_MASK_MANUALLY_RESIZE, the selected mode is clamped to [1.0, 2.0].
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+            return mask_sample_mode_desired;
+        #else
+            return clamp(mask_sample_mode_desired, 1.0, 2.0);
+        #endif
+    #else   //  No runtime selection: use mask_sample_mode_static instead of mask_sample_mode_desired.
+        #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+            return mask_sample_mode_static;
+        #else
+            return clamp(mask_sample_mode_static, 1.0, 2.0);
+        #endif
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+#ifndef RUNTIME_GEOMETRY_TILT
+    //  Create a local-to-global rotation matrix for the CRT's coordinate frame
+    //  and its global-to-local inverse.  See the vertex shader for details.
+    //  It's faster to compute these statically if possible.
+    static const float2 sin_tilt = sin(geom_tilt_angle_static);    //  Componentwise sine of the static tilt angles
+    static const float2 cos_tilt = cos(geom_tilt_angle_static);    //  Componentwise cosine of the same angles
+    static const float3x3 geom_local_to_global_static = float3x3(
+        cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x,
+        0.0, cos_tilt.y, -sin_tilt.y,
+        -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x);
+    static const float3x3 geom_global_to_local_static = float3x3(  //  Same nine terms as above, transposed
+        cos_tilt.x, 0.0, -sin_tilt.x,
+        sin_tilt.y*sin_tilt.x, cos_tilt.y, sin_tilt.y*cos_tilt.x,
+        cos_tilt.y*sin_tilt.x, -sin_tilt.y, cos_tilt.y*cos_tilt.x);
+#endif
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA    //  If defined, the user must supply all seven constants below instead.
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  false: the encode/decode functions below forward color.a instead of writing 1.0
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define crt_gamma, gba_gamma, and lcd_gamma:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else  //  No override: fall back to the standard constants and a fixed GBA value.
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define intermediate_gamma, input_gamma, and output_gamma:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    //  The nested #else chain below makes the first SIMULATE_* macro defined win:
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Not the first pass: input is not decoded, so the gamma is 1.0.
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Not the last pass: output is not encoded, so the gamma is 1.0.
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else  //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif
+
+//  Users might want to know if bilinear filtering will be gamma-correct (true only when this pass does not decode its input):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode color.rgb for this pass's output; alpha is never encoded.
+    //  gamma_encode_output is a static constant, so the unused path folds away.
+    if(!gamma_encode_output)
+    {
+        //  This pass does not encode its output: return the color untouched.
+        return color;
+    }
+    float3 encoded_rgb =
+        pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    if(assume_opaque_alpha)
+    {
+        //  Write a fully opaque alpha instead of forwarding color.a:
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize color.rgb with this pass's input gamma; alpha is never decoded.
+    //  linearize_input is a static constant, so the unused path folds away.
+    if(!linearize_input)
+    {
+        //  This pass does not decode its input: return the color untouched.
+        return color;
+    }
+    float3 linear_rgb =
+        pow(color.rgb, float3(get_pass_input_gamma()));
+    if(assume_opaque_alpha)
+    {
+        //  Write a fully opaque alpha instead of forwarding color.a:
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Decode color.rgb with the caller-supplied per-channel gamma.  This
+    //  always decodes; it does not consult linearize_input.
+    float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: (coords are deliberately non-const here; see the TODO/FIXME note before the wrappers section)
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }   //  Sample, then decode via decode_input()
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }    //  Only tex_coords.xy is read
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }   //  NOTE(review): texel_off is passed as textureLod's LOD argument, not applied as a texel offset -- confirm intent
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }    //  NOTE(review): same LOD-vs-offset question as the float2 overload
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }   //  Always samples LOD 0.0; tex_coords.zw are not read
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }   //  NOTE(review): texel_off is passed as textureLod's LOD argument -- confirm intent
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
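+/////////*  <- NOTE(review): this does not terminate the block comment opened before "tex3D:"; the comment runs on to the terminator after tex2Dfetch_linearize_gamma, which compiles out tex2D_linearize_gamma below.  Confirm intent.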
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D: (NOTE(review): these two overloads appear to sit inside a block comment left open after the tex3D wrappers, so they are compiled out -- confirm)
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }  //  Sample, then decode with the caller's gamma
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }   //  Only tex_coords.xy is read
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{
+    //  Fetch with an explicit LOD of 0.0 (only tex_coords.xy is consulted),
+    //  then decode the fetched color with the supplied per-channel gamma.
+    float4 encoded_color = textureLod(tex, tex_coords.xy, 0.0);
+    return decode_gamma_input(encoded_color, gamma);
+}
+
+//  Overload taking a texel offset.  Samples tex at tex_coords.xy and decodes
+//  the result with the supplied gamma.
+//  NOTE(review): texel_off is forwarded as textureLod()'s third argument,
+//  which GLSL treats as the explicit LOD rather than a texel offset.  The Cg
+//  original presumably used it as an offset -- confirm before relying on this
+//  overload.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "tex2Dantialias.h"
+
+/////////////////////////  BEGIN TEX2DANTIALIAS  /////////////////////////
+
+#ifndef TEX2DANTIALIAS_H
+#define TEX2DANTIALIAS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides antialiased and subpixel-aware tex2D lookups.
+//  Requires:   All functions share these requirements:
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) pixel_to_tex_uv must be a 2x2 matrix that transforms pixel-
+//                  space offsets to texture uv offsets.  You can get this with:
+//                      const float2 duv_dx = ddx(tex_uv);
+//                      const float2 duv_dy = ddy(tex_uv);
+//                      const float2x2 pixel_to_tex_uv = float2x2(
+//                          duv_dx.x, duv_dy.x,
+//                          duv_dx.y, duv_dy.y);
+//                  This is left to the user in case the current Cg profile
+//                  doesn't support ddx()/ddy().  Ideally, the user could
+//                  calculate a distorted tangent-space mapping analytically.
+//                  If not, a simple flat mapping can be obtained with:
+//                      const float2 xy_to_uv_scale = output_size *
+//                          video_size/texture_size;
+//                      const float2x2 pixel_to_tex_uv = float2x2(
+//                          xy_to_uv_scale.x, 0.0,
+//                          0.0, xy_to_uv_scale.y);
+//  Optional:   To set basic AA settings, #define ANTIALIAS_OVERRIDE_BASICS and:
+//              1.) Set an antialiasing level:
+//                      static const float aa_level = {0 (none),
+//                          1 (sample subpixels), 4, 5, 6, 7, 8, 12, 16, 20, 24}
+//              2.) Set a filter type:
+//                      static const float aa_filter = {
+//                          0 (Box, Separable), 1 (Box, Cylindrical),
+//                          2 (Tent, Separable), 3 (Tent, Cylindrical)
+//                          4 (Gaussian, Separable), 5 (Gaussian, Cylindrical)
+//                          6 (Cubic, Separable), 7 (Cubic, Cylindrical)
+//                          8 (Lanczos Sinc, Separable),
+//                          9 (Lanczos Jinc, Cylindrical)}
+//                  If the input is unknown, a separable box filter is used.
+//                  Note: Lanczos Jinc is terrible for sparse sampling, and
+//                  using aa_axis_importance (see below) defeats the purpose.
+//              3.) Mirror the sample pattern on odd frames?
+//                      static const bool aa_temporal = {true, false}
+//                  This helps rotational invariance but can look "fluttery."
+//              The user may #define ANTIALIAS_OVERRIDE_PARAMETERS to override
+//              (all of) the following default parameters with static or uniform
+//              constants (or an accessor function for subpixel offsets):
+//              1.) Cubic parameters:
+//                      static const float aa_cubic_c = 0.5;
+//                  See http://www.imagemagick.org/Usage/filter/#mitchell
+//              2.) Gaussian parameters:
+//                      static const float aa_gauss_sigma =
+//                          0.5/aa_pixel_diameter;
+//              3.) Set subpixel offsets.  This requires an accessor function
+//                  for compatibility with scalar runtime shader parameters.  Return
+//                  a float2 pixel offset in [-0.5, 0.5] for the red subpixel:
+//                      float2 get_aa_subpixel_r_offset()
+//              The user may also #define ANTIALIAS_OVERRIDE_STATIC_CONSTANTS to
+//              override (all of) the following default static values.  However,
+//              the file's structure requires them to be declared static const:
+//              1.) static const float aa_lanczos_lobes = 3.0;
+//              2.) static const float aa_gauss_support = 1.0/aa_pixel_diameter;
+//                  Note the default tent/Gaussian support radii may appear
+//                  arbitrary, but extensive testing found them nearly optimal
+//                  for tough cases like strong distortion at low AA levels.
+//                  (The Gaussian default is only best for practical gauss_sigma
+//                  values; much larger gauss_sigmas ironically prefer slightly
+//                  smaller support given sparse sampling, and vice versa.)
+//              3.) static const float aa_tent_support = 1.0 / aa_pixel_diameter;
+//              4.) static const float2 aa_xy_axis_importance:
+//                  The sparse N-queens sampling grid interacts poorly with
+//                  negative-lobed 2D filters.  However, if aliasing is much
+//                  stronger in one direction (e.g. horizontally with a phosphor
+//                  mask), it can be useful to downplay sample offsets along the
+//                  other axis.  The support radius in each direction scales with
+//                  aa_xy_axis_importance down to a minimum of 0.5 (box support),
+//                  after which point only the offsets used for calculating
+//                  weights continue to scale downward.  This works as follows:
+//                  If aa_xy_axis_importance = float2(1.0, 1.0/support_radius),
+//                  the vertical support radius will drop to 1.0, and we'll just
+//                  filter vertical offsets with the first filter lobe, while
+//                  horizontal offsets go through the full multi-lobe filter.
+//                  If aa_xy_axis_importance = float2(1.0, 0.0), the vertical
+//                  support radius will drop to box support, and the vertical
+//                  offsets will be ignored entirely (essentially giving us a
+//                  box filter vertically).  The former is potentially smoother
+//                  (but less predictable) and the default behavior of Lanczos
+//                  jinc, whereas the latter is sharper and the default behavior
+//                  of cubics and Lanczos sinc.
+//              5.) static const float aa_pixel_diameter: You can expand the
+//                  pixel diameter to e.g. sqrt(2.0), which may be a better
+//                  support range for cylindrical filters (they don't
+//                  currently discard out-of-circle samples though).
+//              Finally, there are two miscellaneous options:
+//              1.) If you want to antialias a manually tiled texture, you can
+//                  #define ANTIALIAS_DISABLE_ANISOTROPIC to use tex2Dlod() to
+//                  fix incompatibilities with anisotropic filtering.  This is
+//                  slower, and the Cg profile must support tex2Dlod().
+//              2.) If aa_cubic_c is a runtime uniform, you can #define
+//                  RUNTIME_ANTIALIAS_WEIGHTS to evaluate cubic weights once per
+//                  fragment instead of at the usage site (which is used by
+//                  default, because it enables static evaluation).
+//  Description:
+//  Each antialiased lookup follows these steps:
+//  1.) Define a sample pattern of pixel offsets in the range of [-0.5, 0.5]
+//      pixels, spanning the diameter of a rectangular box filter.
+//  2.) Scale these offsets by the support diameter of the user's chosen filter.
+//  3.) Using these pixel offsets from the pixel center, compute the offsets to
+//      predefined subpixel locations.
+//  4.) Compute filter weights based on subpixel offsets.
+//  Much of that can often be done at compile-time.  At runtime:
+//  1.) Project pixel-space offsets into uv-space with a matrix multiplication
+//      to get the uv offsets for each sample.  Rectangular pixels have a
+//      diameter of 1.0.  Circular pixels are not currently supported, but they
+//      might be better with a diameter of sqrt(2.0) to ensure there are no gaps
+//      between them.
+//  2.) Load, weight, and sum samples.
+//  We use a sparse bilinear sampling grid, so there are two major implications:
+//  1.) We can directly project the pixel-space support box into uv-space even
+//      if we're upsizing.  This wouldn't be the case for nearest neighbor,
+//      where we'd have to expand the uv-space diameter to at least the support
+//      size to ensure sufficient filter support.  In our case, this allows us
+//      to treat upsizing the same as downsizing and use static weighting. :)
+//  2.) For decent results, negative-lobed filters must be computed based on
+//      separable weights, not radial distances, because the sparse sampling
+//      makes no guarantees about radial distributions.  Even then, it's much
+//      better to set aa_xy_axis_importance to e.g. float2(1.0, 0.0) to use e.g.
+//      Lanczos2 horizontally and a box filter vertically.  This is mainly due
+//      to the sparse N-queens sampling and a statistically enormous positive or
+//      negative covariance between horizontal and vertical weights.
+//
+//  Design Decision Comments:
+//  "aa_temporal" mirrors the sample pattern on odd frames along the axis that
+//  keeps subpixel weights constant.  This helps with rotational invariance, but
+//  it can cause distracting fluctuations, and horizontal and vertical edges
+//  will look the same.  Using a different pattern on a shifted grid would
+//  exploit temporal AA better, but it would require a dynamic branch or a lot
+//  of conditional moves, so it's prohibitively slow for the minor benefit.
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#ifndef ANTIALIAS_OVERRIDE_BASICS
+    //  Defaults used unless the including shader defines
+    //  ANTIALIAS_OVERRIDE_BASICS and supplies its own values.
+    //  The following settings must be static constants:
+    //  Antialiasing level (see the list of accepted levels in the header).
+    static const float aa_level = 12.0;
+    //  Filter index; 0 selects the separable box filter.
+    static const float aa_filter = 0.0;
+    //  Whether to mirror the sample pattern on odd frames.
+    static const bool aa_temporal = false;
+#endif
+
+#ifndef ANTIALIAS_OVERRIDE_STATIC_CONSTANTS
+    //  Users may override these parameters, but the file structure requires
+    //  them to be static constants; see the descriptions above.
+    //  aa_pixel_diameter must be declared first: the Gaussian and tent support
+    //  radii below are derived from it.
+    static const float aa_pixel_diameter = 1.0;
+    static const float aa_lanczos_lobes = 3.0;
+    static const float aa_gauss_support = 1.0 / aa_pixel_diameter;
+    static const float aa_tent_support = 1.0 / aa_pixel_diameter;
+    
+    //  If we're using a negative-lobed filter, default to using it horizontally
+    //  only, and use only the first lobe vertically or a box filter, over a
+    //  correspondingly smaller range.  This compensates for the sparse sampling
+    //  grid's typically large positive/negative x/y covariance.
+    //  The thresholds sit halfway between integer aa_filter indices, so each
+    //  comparison selects a contiguous range of filter types.
+    static const float2 aa_xy_axis_importance =
+        aa_filter < 5.5 ? float2(1.0) :         //  Box, tent, Gaussian
+        aa_filter < 8.5 ? float2(1.0, 0.0) :    //  Cubic and Lanczos sinc
+        aa_filter < 9.5 ? float2(1.0, 1.0/aa_lanczos_lobes) :   //  Lanczos jinc
+        float2(1.0);                            //  Default to box
+#endif
+
+#ifndef ANTIALIAS_OVERRIDE_PARAMETERS
+    //  Users may override these values with their own uniform or static consts.
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c = 0.5;
+    //  Standard deviation used by eval_gaussian_filter().
+    static const float aa_gauss_sigma = 0.5 / aa_pixel_diameter;
+    //  Users may override the subpixel offset accessor function with their own.
+    //  A function is used for compatibility with scalar runtime shader
+    //  parameters.  The default returns a zero offset for the red subpixel.
+    inline float2 get_aa_subpixel_r_offset()
+    {
+        return float2(0.0, 0.0);
+    }
+#endif
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  Fixed support radii for the box and cubic filters.  aa_box_support also
+//  serves as the lower bound when the support window is narrowed by
+//  aa_xy_axis_importance.
+static const float aa_box_support = 0.5;
+static const float aa_cubic_support = 2.0;
+
+
+////////////////////////////  GLOBAL NON-CONSTANTS  ////////////////////////////
+
+//  We'll want to define these only once per fragment at most.
+#ifdef RUNTIME_ANTIALIAS_WEIGHTS
+    //  Cubic filter coefficients written by assign_aa_cubic_constants() and
+    //  read by eval_cubic_filter().  "branch1" holds the polynomial used for
+    //  |x| < 1 and "branch2" the one used for 1 <= |x| < 2; the xN suffix
+    //  names the power of x each coefficient multiplies.
+    float aa_cubic_b;
+    float cubic_branch1_x3_coeff;
+    float cubic_branch1_x2_coeff;
+    float cubic_branch1_x0_coeff;
+    float cubic_branch2_x3_coeff;
+    float cubic_branch2_x2_coeff;
+    float cubic_branch2_x1_coeff;
+    float cubic_branch2_x0_coeff;
+#endif
+
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+void assign_aa_cubic_constants()
+{
+    //  When RUNTIME_ANTIALIAS_WEIGHTS is defined, derive the cubic polynomial
+    //  coefficients from aa_cubic_c and store them in the global coefficient
+    //  variables; otherwise this function does nothing.  B is tied to C via
+    //  B = 1 - 2C, because such "Keys cubics" are considered the highest
+    //  quality.
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        //  Only the cubic filters (aa_filter in (5.5, 7.5)) need coefficients.
+        const bool cubic_selected = (aa_filter > 5.5) && (aa_filter < 7.5);
+        if(cubic_selected)
+        {
+            const float keys_b = 1.0 - 2.0*aa_cubic_c;
+            aa_cubic_b = keys_b;
+            //  Outer polynomial, used for 1 <= |x| < 2:
+            cubic_branch2_x0_coeff = 8.0*keys_b + 24.0*aa_cubic_c;
+            cubic_branch2_x1_coeff = -12.0*keys_b - 48.0*aa_cubic_c;
+            cubic_branch2_x2_coeff = 6.0*keys_b + 30.0*aa_cubic_c;
+            cubic_branch2_x3_coeff = -keys_b - 6.0 * aa_cubic_c;
+            //  Inner polynomial, used for |x| < 1:
+            cubic_branch1_x0_coeff = 6.0 - 2.0 * keys_b;
+            cubic_branch1_x2_coeff = -18.0 + 12.0*keys_b + 6.0*aa_cubic_c;
+            cubic_branch1_x3_coeff = 12.0 - 9.0*keys_b - 6.0*aa_cubic_c;
+        }
+    #endif
+}
+
+inline float4 get_subpixel_support_diam_and_final_axis_importance()
+{
+    //  Returns:    xy = support diameter of the subpixel-aware filter window,
+    //              zw = the axis importance still to be applied to offsets
+    //              when computing weights.
+    //  Pick the unscaled support radius of the statically selected filter:
+    static const float filter_radius =
+        aa_filter < 1.5 ? aa_box_support :
+        aa_filter < 3.5 ? aa_tent_support :
+        aa_filter < 5.5 ? aa_gauss_support :
+        aa_filter < 7.5 ? aa_cubic_support :
+        aa_filter < 9.5 ? aa_lanczos_lobes :
+        aa_box_support; //  Unknown filter: fall back to box
+    //  Widen the window by the subpixel offset so that every subpixel is
+    //  covered by the filter.
+    const float2 raw_radius =
+        float2(filter_radius) + abs(get_aa_subpixel_r_offset());
+    //  Box filters ignore aa_xy_axis_importance entirely.
+    if(aa_filter < 1.5)
+    {
+        return float4(2.0 * raw_radius, float2(1.0));
+    }
+    //  Otherwise shrink the window by aa_xy_axis_importance, but never below
+    //  box support.  This permits decent vertical AA without disturbing the
+    //  horizontal weights or averaging vertically over a huge radius.
+    const float2 clamped_radius = max(float2(aa_box_support, aa_box_support),
+        raw_radius * aa_xy_axis_importance);
+    //  Whatever scaling the clamp prevented is left for the weight functions:
+    const float2 leftover_importance = aa_xy_axis_importance *
+        raw_radius/clamped_radius;
+    return float4(2.0 * clamped_radius, leftover_importance);
+}
+
+
+///////////////////////////  FILTER WEIGHT FUNCTIONS  //////////////////////////
+
+inline float eval_box_filter(const float dist)
+{
+    //  Weight is 1.0 inside the box support radius (inclusive), 0.0 outside.
+    return (abs(dist) <= aa_box_support) ? 1.0 : 0.0;
+}
+
+inline float eval_separable_box_filter(const float2 offset)
+{
+    //  Weight is 1.0 only when the offset lies within the box support radius
+    //  along both axes (inclusive); otherwise 0.0.
+    const bool inside_x = abs(offset.x) <= aa_box_support;
+    const bool inside_y = abs(offset.y) <= aa_box_support;
+    return float(inside_x && inside_y);
+}
+
+inline float eval_tent_filter(const float dist)
+{
+    //  Linear falloff from 1.0 at the center to 0.0 at aa_tent_support.
+    //  Use abs(dist): the separable tent path passes signed x/y offsets, and a
+    //  negative distance would otherwise clamp to full weight (1.0) instead of
+    //  falling off symmetrically.  Cylindrical callers pass length(), which is
+    //  already non-negative, so their results are unchanged.
+    return clamp((aa_tent_support - abs(dist))/
+        aa_tent_support, 0.0, 1.0);
+}
+
+inline float eval_gaussian_filter(const float dist)
+{
+    //  Unnormalized Gaussian: exp(-dist^2 / (2 * sigma^2)).
+    const float two_sigma_sq = 2.0*aa_gauss_sigma*aa_gauss_sigma;
+    return exp(-(dist*dist) / two_sigma_sq);
+}
+
+inline float eval_cubic_filter(const float dist)
+{
+    //  Evaluates the two-piece cubic filter at |dist|.  The weight is zero for
+    //  |dist| >= 2.0.
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        //  Static path: derive the coefficients locally, with the same formulas
+        //  as assign_aa_cubic_constants().  In the runtime path these names
+        //  refer to the globals that function fills in instead.
+        const float aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
+        const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
+        const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
+        const float cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b;
+        const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c;
+        const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
+        const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
+        const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
+    #endif
+    //  Horner's-method evaluation, following:
+    //  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf
+    const float x = abs(dist);
+    if(x < 1.0)
+    {
+        //  Inner piece (this polynomial has no linear term):
+        return ((cubic_branch1_x3_coeff*x +
+            cubic_branch1_x2_coeff)*x*x +
+            cubic_branch1_x0_coeff)/6.0;
+    }
+    if(x < 2.0)
+    {
+        //  Outer piece:
+        return (((cubic_branch2_x3_coeff*x +
+            cubic_branch2_x2_coeff)*x +
+            cubic_branch2_x1_coeff)*x + cubic_branch2_x0_coeff)/6.0;
+    }
+    //  Outside the cubic support:
+    return 0.0;
+}
+
+inline float eval_separable_cubic_filter(const float2 offset)
+{
+    //  Product of the 1D cubic weights along x and y.  Two scalar evaluations
+    //  are faster than a dedicated float2 version.
+    const float weight_x = eval_cubic_filter(offset.x);
+    const float weight_y = eval_cubic_filter(offset.y);
+    return weight_x * weight_y;
+}
+
+inline float2 eval_sinc_filter(const float2 offset)
+{
+    //  Per-component sin(pi*x)/(pi*x).  A zero offset is NOT handled here: the
+    //  caller must guard against it (it's faster that way, or at least it was
+    //  when macros were used and the shader preset took a full minute to load).
+    const float2 phase = pi * offset;
+    return sin(phase)/phase;
+}
+
+inline float eval_separable_lanczos_sinc_filter(const float2 offset_unsafe)
+{
+    //  Separable Lanczos: sinc(x) * sinc(x/lobes) per axis, then the product
+    //  of the two axis weights.
+    //  Note: For sparse sampling, you really need to pick an axis to use
+    //  Lanczos along (e.g. set aa_xy_axis_importance = float2(1.0, 0.0)).
+    //  FIX_ZERO guards the divisions inside eval_sinc_filter().
+    const float2 safe_offset = FIX_ZERO(offset_unsafe);
+    const float2 sinc_term = eval_sinc_filter(safe_offset);
+    const float2 window_term = eval_sinc_filter(safe_offset/aa_lanczos_lobes);
+    const float2 axis_weights = sinc_term * window_term;
+    return axis_weights.x * axis_weights.y;
+}
+
+inline float eval_jinc_filter_unorm(const float x)
+{
+    //  Returns an unnormalized closed-form approximation of the jinc function
+    //  at x.  The numeric constants are fitted values; the expression's
+    //  operator precedence is significant, so it is left exactly as written.
+    //  This is a Jinc approximation for x in [0, 45).  We'll use x in range
+    //  [0, 4*pi) or so.  There are faster/closer approximations based on
+    //  piecewise cubics from [0, 45) and asymptotic approximations beyond that,
+    //  but this has a maximum absolute error < 1/512, and it's simpler/faster
+    //  for shaders...not that it's all that useful for sparse sampling anyway.
+    const float point3845_x = 0.38448566093564*x;
+    const float exp_term = exp(-(point3845_x*point3845_x));
+    const float point8154_plus_x = 0.815362332840791 + x;
+    const float cos_term = cos(point8154_plus_x);
+    return (
+        0.0264727330997042*min(x, 6.83134964622778) +
+        0.680823557250528*exp_term +
+        -0.0597255978950933*min(7.41043194481873, x)*cos_term /
+            (point8154_plus_x + 0.0646074538634482*(x*x) +
+            cos(x)*max(exp_term, cos(x) + cos_term)) -
+        0.180837503591406);
+}
+
+inline float eval_jinc_filter(const float dist)
+{
+    //  Scale the pixel distance by pi before evaluating the approximation.
+    const float scaled_dist = pi * dist;
+    return eval_jinc_filter_unorm(scaled_dist);
+}
+
+inline float eval_lanczos_jinc_filter(const float dist)
+{
+    //  Jinc windowed by a second jinc stretched over aa_lanczos_lobes.
+    const float kernel_term = eval_jinc_filter(dist);
+    const float window_term = eval_jinc_filter(dist/aa_lanczos_lobes);
+    return kernel_term * window_term;
+}
+
+
+inline float3 eval_unorm_rgb_weights(const float2 offset,
+    const float2 final_axis_importance)
+{
+    //  Requires:   1.) final_axis_importance must be computed according to
+    //                  get_subpixel_support_diam_and_final_axis_importance().
+    //              2.) aa_filter must be a global constant.
+    //              3.) offset must be an xy pixel offset in the range:
+    //                      ([-subpixel_support_diameter.x/2,
+    //                      subpixel_support_diameter.x/2],
+    //                      [-subpixel_support_diameter.y/2,
+    //                      subpixel_support_diameter.y/2])
+    //  Returns:    Unnormalized sample weights at the R/G/B destination
+    //              subpixels for the given xy pixel offset.
+    //  Green sits at the pixel center; red and blue are displaced from it by
+    //  the (importance-scaled) red subpixel offset in opposite directions.
+    const float2 subpixel_shift =
+        get_aa_subpixel_r_offset() * final_axis_importance;
+    const float2 offset_g = offset * final_axis_importance;
+    const float2 offset_r = offset_g - subpixel_shift;
+    const float2 offset_b = offset_g + subpixel_shift;
+    //  Statically select a filter.  Even indices are separable (per-axis
+    //  weights multiplied); odd indices are cylindrical (radial distance).
+    if(aa_filter < 0.5)     //  0: Box, separable
+    {
+        return float3(eval_separable_box_filter(offset_r),
+            eval_separable_box_filter(offset_g),
+            eval_separable_box_filter(offset_b));
+    }
+    if(aa_filter < 1.5)     //  1: Box, cylindrical
+    {
+        return float3(eval_box_filter(length(offset_r)),
+            eval_box_filter(length(offset_g)),
+            eval_box_filter(length(offset_b)));
+    }
+    if(aa_filter < 2.5)     //  2: Tent, separable
+    {
+        return float3(
+            eval_tent_filter(offset_r.x) * eval_tent_filter(offset_r.y),
+            eval_tent_filter(offset_g.x) * eval_tent_filter(offset_g.y),
+            eval_tent_filter(offset_b.x) * eval_tent_filter(offset_b.y));
+    }
+    if(aa_filter < 3.5)     //  3: Tent, cylindrical
+    {
+        return float3(eval_tent_filter(length(offset_r)),
+            eval_tent_filter(length(offset_g)),
+            eval_tent_filter(length(offset_b)));
+    }
+    if(aa_filter < 4.5)     //  4: Gaussian, separable
+    {
+        return float3(
+            eval_gaussian_filter(offset_r.x) * eval_gaussian_filter(offset_r.y),
+            eval_gaussian_filter(offset_g.x) * eval_gaussian_filter(offset_g.y),
+            eval_gaussian_filter(offset_b.x) * eval_gaussian_filter(offset_b.y));
+    }
+    if(aa_filter < 5.5)     //  5: Gaussian, cylindrical
+    {
+        return float3(eval_gaussian_filter(length(offset_r)),
+            eval_gaussian_filter(length(offset_g)),
+            eval_gaussian_filter(length(offset_b)));
+    }
+    if(aa_filter < 6.5)     //  6: Cubic, separable
+    {
+        return float3(
+            eval_cubic_filter(offset_r.x) * eval_cubic_filter(offset_r.y),
+            eval_cubic_filter(offset_g.x) * eval_cubic_filter(offset_g.y),
+            eval_cubic_filter(offset_b.x) * eval_cubic_filter(offset_b.y));
+    }
+    if(aa_filter < 7.5)     //  7: Cubic, cylindrical
+    {
+        return float3(eval_cubic_filter(length(offset_r)),
+            eval_cubic_filter(length(offset_g)),
+            eval_cubic_filter(length(offset_b)));
+    }
+    if(aa_filter < 8.5)     //  8: Lanczos sinc, separable
+    {
+        return float3(eval_separable_lanczos_sinc_filter(offset_r),
+            eval_separable_lanczos_sinc_filter(offset_g),
+            eval_separable_lanczos_sinc_filter(offset_b));
+    }
+    if(aa_filter < 9.5)     //  9: Lanczos jinc, cylindrical
+    {
+        return float3(eval_lanczos_jinc_filter(length(offset_r)),
+            eval_lanczos_jinc_filter(length(offset_g)),
+            eval_lanczos_jinc_filter(length(offset_b)));
+    }
+    //  Unknown filter index: default to a box, because Lanczos Jinc is so
+    //  bad. ;)
+    return float3(eval_separable_box_filter(offset_r),
+        eval_separable_box_filter(offset_g),
+        eval_separable_box_filter(offset_b));
+}
+
+
+//////////////////////////////  HELPER FUNCTIONS  //////////////////////////////
+
+inline float4 tex2Daa_tiled_linearize(const sampler2D samp, const float2 s)
+{
+    //  Samples samp at uv coordinate s through one of the linearizing lookup
+    //  helpers.  Which helper is used is chosen at compile time.
+    //  If we're manually tiling a texture, anisotropic filtering can get
+    //  confused.  This is one workaround:
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        //  Explicit-LOD lookup; the trailing zw components are passed as 0.0.
+        //  TODO: Use tex2Dlod_linearize with a calculated mip level.
+        return tex2Dlod_linearize(samp, float4(s, 0.0, 0.0));
+    #else
+        //  Default: an ordinary linearizing lookup.
+        return tex2D_linearize(samp, s);
+    #endif
+}
+
+inline float2 get_frame_sign(const float frame)
+{
+    //  Returns the per-axis multiplier applied to the sample offsets for the
+    //  given frame number.  float2(1.0, 1.0) leaves the pattern untouched.
+    if(aa_temporal)
+    {
+        //  Mirror the sampling pattern for odd frames in a direction that
+        //  lets us keep the same subpixel sample weights:
+        const float frame_odd = float(fmod(frame, 2.0) > 0.5);
+        const float2 aa_r_offset = get_aa_subpixel_r_offset();
+        //  NOTE(review): axes with a nonzero subpixel offset get 0.0 here
+        //  rather than 1.0, which collapses offsets along them on odd frames;
+        //  confirm whether that is intended.
+        const float2 mirror = -float2(abs(aa_r_offset.x) < (FIX_ZERO(0.0)), abs(aa_r_offset.y) < (FIX_ZERO(0.0)));
+        //  Select by frame parity.  Previously frame_odd was computed but
+        //  unused and the mirror was returned for every frame, so the pattern
+        //  never alternated.  frame_odd is exactly 0.0 or 1.0, so this is
+        //  equivalent to lerp(float2(1.0), mirror, frame_odd).
+        return float2(1.0, 1.0) + (mirror - float2(1.0, 1.0)) * frame_odd;
+    }
+    else
+    {
+        return float2(1.0, 1.0);
+    }
+}
+
+
+/////////////////////////  ANTIALIASED TEXTURE LOOKUPS  ////////////////////////
+
+float3 tex2Daa_subpixel_weights_only(const sampler2D tex,
+    const float2 tex_uv, const float2x2 pixel_to_tex_uv)
+{
+    //  Unlike the other tex2Daa* functions, this performs no filtering: it
+    //  takes one independent lookup per subpixel and keeps only the matching
+    //  channel from each.  The result may be very aliased.
+    //  Project the red subpixel's pixel-space offset into uv-space:
+    const float2 red_uv_shift =
+        mul(pixel_to_tex_uv, get_aa_subpixel_r_offset());
+    //  Green is read at the center; red and blue at opposite offsets.
+    const float red = tex2D_linearize(tex, tex_uv + red_uv_shift).r;
+    const float green = tex2D_linearize(tex, tex_uv).g;
+    const float blue = tex2D_linearize(tex, tex_uv - red_uv_shift).b;
+    return float3(red, green, blue);
+}
+
+//  The tex2Daa* functions compile very slowly due to all the macros and
+//  compile-time math, so only include the ones we'll actually use!
+//  Antialiased lookup using 4 samples.
+//  tex:              texture to sample.
+//  tex_uv:           uv coordinate of the pixel center.
+//  pixel_to_tex_uv:  2x2 matrix mapping pixel-space offsets to uv offsets.
+//  frame:            frame counter, used only for the optional temporal mirror.
+//  Returns the weighted sum of the samples, normalized per channel.
+float3 tex2Daa4x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Use an RGMS4 pattern (4-queens):
+    //  . . Q .  : off =(-1.5, -1.5)/4 + (2.0, 0.0)/4
+    //  Q . . .  : off =(-1.5, -1.5)/4 + (0.0, 1.0)/4
+    //  . . . Q  : off =(-1.5, -1.5)/4 + (3.0, 2.0)/4
+    //  . Q . .  : off =(-1.5, -1.5)/4 + (1.0, 3.0)/4
+    //  Static screenspace sample offsets (compute some implicitly):
+    static const float grid_size = 4.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0,1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5,0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
+    //  Only the first two offsets are computed; the last two pattern entries
+    //  are their negations (offset3 = -offset0, offset2 = -offset1).
+    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(0.0, 1.0) * xy_step;
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    //  Weights for the negated offsets reuse w1/w0 with red and blue swapped.
+    const float3 w2 = w1.bgr;
+    const float3 w3 = w0.bgr;
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const float3 half_sum = w0 + w1;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0,1.0,1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
+    //  diagonal symmetry:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    //  Load samples, linearizing if necessary, etc.:
+    //  sample2 and sample3 sit at the negated uv offsets, matching w2 and w3.
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (w0 * sample0 + w1 * sample1 +
+        w2 * sample2 + w3 * sample3);
+}
+
+//  Antialiased lookup using 5 samples.
+//  tex:              texture to sample.
+//  tex_uv:           uv coordinate of the pixel center.
+//  pixel_to_tex_uv:  2x2 matrix mapping pixel-space offsets to uv offsets.
+//  frame:            frame counter, used only for the optional temporal mirror.
+//  Returns the weighted sum of the samples, normalized per channel.
+float3 tex2Daa5x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Use a diagonally symmetric 5-queens pattern:
+    //  . Q . . .  : off =(-2.0, -2.0)/5 + (1.0, 0.0)/5
+    //  . . . . Q  : off =(-2.0, -2.0)/5 + (4.0, 1.0)/5
+    //  . . Q . .  : off =(-2.0, -2.0)/5 + (2.0, 2.0)/5
+    //  Q . . . .  : off =(-2.0, -2.0)/5 + (0.0, 3.0)/5
+    //  . . . Q .  : off =(-2.0, -2.0)/5 + (3.0, 4.0)/5
+    //  Static screenspace sample offsets (compute some implicitly):
+    static const float grid_size = 5.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
+    //  xy_offset2 is the pattern center (zero offset); the last two pattern
+    //  entries are the negations of xy_offset1 and xy_offset0.
+    const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(2.0, 2.0) * xy_step;
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    //  Weights for the negated offsets reuse w1/w0 with red and blue swapped.
+    const float3 w3 = w1.bgr;
+    const float3 w4 = w0.bgr;
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const float3 w_sum_inv = float3(1.0)/(w0 + w1 + w2 + w3 + w4);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
+    //  diagonal symmetry:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    //  Load samples, linearizing if necessary, etc.:
+    //  sample2 is the center tap; sample3 and sample4 sit at the negated uv
+    //  offsets, matching w3 and w4.
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (w0 * sample0 + w1 * sample1 +
+        w2 * sample2 + w3 * sample3 + w4 * sample4);
+}
+
+float3 tex2Daa6x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Weighted 6-tap average of tex around tex_uv, using a diagonally symmetric 6-queens
+    //  pattern with a stronger horizontal than vertical slant:
+    //  . . . . Q .  : off =(-2.5, -2.5)/6 + (4.0, 0.0)/6
+    //  . . Q . . .  : off =(-2.5, -2.5)/6 + (2.0, 1.0)/6
+    //  Q . . . . .  : off =(-2.5, -2.5)/6 + (0.0, 2.0)/6
+    //  . . . . . Q  : off =(-2.5, -2.5)/6 + (5.0, 3.0)/6
+    //  . . . Q . .  : off =(-2.5, -2.5)/6 + (3.0, 4.0)/6
+    //  . Q . . . .  : off =(-2.5, -2.5)/6 + (1.0, 5.0)/6
+    //  Sample grid: one cell is subpixel_support_diameter/6 and the start offset is -2.5 cells per axis:
+    static const float grid_size = 6.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Explicit xy offsets for rows 0-2 only; rows 3-5 are the negations of rows 2-0:
+    const float2 xy_offset0 = xy_start_offset + float2(4.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(2.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step;
+    //  Per-channel (rgb) unnormalized weights; mirrored rows reuse their partner's weight with r and b swapped:
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = w2.bgr;
+    const float3 w4 = w1.bgr;
+    const float3 w5 = w0.bgr;
+    //  Per-channel weight sum (explicit half plus its .bgr mirror) and its reciprocal for normalization:
+    const float3 half_sum = w0 + w1 + w2;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the caller's pixel-to-uv matrix by aa_pixel_diameter (defined outside this function):
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Map the explicit offsets to uv space; the sign from get_frame_sign(frame) mirrors them on odd
+    //  frames if directed.  Mirrored taps subtract these same uv offsets instead of adding them:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    //  Fetch the 6 taps in row order (tex2Daa_tiled_linearize linearizes if necessary, etc.):
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Weighted sum, normalized per channel so the effective weights add up to 1.0:
+    return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 +
+        w3 * sample3 + w4 * sample4 + w5 * sample5);
+}
+
+float3 tex2Daa7x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Weighted 7-tap average of tex around tex_uv; diagonally symmetric 7-queens pattern with a queen in the center:
+    //  . Q . . . . .  : off =(-3.0, -3.0)/7 + (1.0, 0.0)/7
+    //  . . . . Q . .  : off =(-3.0, -3.0)/7 + (4.0, 1.0)/7
+    //  Q . . . . . .  : off =(-3.0, -3.0)/7 + (0.0, 2.0)/7
+    //  . . . Q . . .  : off =(-3.0, -3.0)/7 + (3.0, 3.0)/7
+    //  . . . . . . Q  : off =(-3.0, -3.0)/7 + (6.0, 4.0)/7
+    //  . . Q . . . .  : off =(-3.0, -3.0)/7 + (2.0, 5.0)/7
+    //  . . . . . Q .  : off =(-3.0, -3.0)/7 + (5.0, 6.0)/7
+    static const float grid_size = 7.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Explicit xy offsets for rows 0-3 on a grid with cell = subpixel_support_diameter/7 (row 3 is the zero-offset center):
+    const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(3.0, 3.0) * xy_step;
+    //  Per-channel (rgb) unnormalized weights; rows 4-6 mirror rows 2-0 and reuse their weights with r and b swapped:
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = w2.bgr;
+    const float3 w5 = w1.bgr;
+    const float3 w6 = w0.bgr;
+    //  Per-channel weight sum (explicit half, its .bgr mirror, plus the center weight once) and its reciprocal:
+    const float3 half_sum = w0 + w1 + w2;
+    const float3 w_sum = half_sum + half_sum.bgr + w3;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the caller's pixel-to-uv matrix by aa_pixel_diameter (defined outside this function):
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Map offsets 0-2 to uv space; the sign from get_frame_sign(frame) mirrors them on odd frames if
+    //  directed.  The center tap needs no offset and mirrored taps subtract these same uv offsets:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    //  Fetch the 7 taps in row order (tex2Daa_tiled_linearize linearizes if necessary, etc.); sample3 is the center:
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Weighted sum, normalized per channel so the effective weights add up to 1.0:
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6);
+}
+
+float3 tex2Daa8x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Weighted 8-tap average of tex around tex_uv, using a diagonally symmetric 8-queens pattern.
+    //  . . Q . . . . .  : off =(-3.5, -3.5)/8 + (2.0, 0.0)/8
+    //  . . . . Q . . .  : off =(-3.5, -3.5)/8 + (4.0, 1.0)/8
+    //  . Q . . . . . .  : off =(-3.5, -3.5)/8 + (1.0, 2.0)/8
+    //  . . . . . . . Q  : off =(-3.5, -3.5)/8 + (7.0, 3.0)/8
+    //  Q . . . . . . .  : off =(-3.5, -3.5)/8 + (0.0, 4.0)/8
+    //  . . . . . . Q .  : off =(-3.5, -3.5)/8 + (6.0, 5.0)/8
+    //  . . . Q . . . .  : off =(-3.5, -3.5)/8 + (3.0, 6.0)/8
+    //  . . . . . Q . .  : off =(-3.5, -3.5)/8 + (5.0, 7.0)/8
+    static const float grid_size = 8.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Explicit xy offsets for rows 0-3 on a grid with cell = subpixel_support_diameter/8:
+    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(1.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(7.0, 3.0) * xy_step;
+    //  Per-channel (rgb) unnormalized weights; rows 4-7 mirror rows 3-0 and reuse their weights with r and b swapped:
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = w3.bgr;
+    const float3 w5 = w2.bgr;
+    const float3 w6 = w1.bgr;
+    const float3 w7 = w0.bgr;
+    //  Per-channel weight sum (explicit half plus its .bgr mirror) and its reciprocal for normalization:
+    const float3 half_sum = w0 + w1 + w2 + w3;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the caller's pixel-to-uv matrix by aa_pixel_diameter (defined outside this function):
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Map the explicit offsets to uv space (mirrored on odd frames if directed); mirrored taps subtract them:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    //  Fetch the 8 taps in row order (tex2Daa_tiled_linearize linearizes if necessary, etc.):
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Weighted sum, normalized per channel so the effective weights add up to 1.0:
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7);
+}
+
+float3 tex2Daa12x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Weighted 12-tap average of tex around tex_uv, using a diagonally symmetric
+    //  12-superqueens pattern where no 3 points are exactly collinear.
+    //  . . . Q . . . . . . . .  : off =(-5.5, -5.5)/12 + (3.0, 0.0)/12
+    //  . . . . . . . . . Q . .  : off =(-5.5, -5.5)/12 + (9.0, 1.0)/12
+    //  . . . . . . Q . . . . .  : off =(-5.5, -5.5)/12 + (6.0, 2.0)/12
+    //  . Q . . . . . . . . . .  : off =(-5.5, -5.5)/12 + (1.0, 3.0)/12
+    //  . . . . . . . . . . . Q  : off =(-5.5, -5.5)/12 + (11.0, 4.0)/12
+    //  . . . . Q . . . . . . .  : off =(-5.5, -5.5)/12 + (4.0, 5.0)/12
+    //  . . . . . . . Q . . . .  : off =(-5.5, -5.5)/12 + (7.0, 6.0)/12
+    //  Q . . . . . . . . . . .  : off =(-5.5, -5.5)/12 + (0.0, 7.0)/12
+    //  . . . . . . . . . . Q .  : off =(-5.5, -5.5)/12 + (10.0, 8.0)/12
+    //  . . . . . Q . . . . . .  : off =(-5.5, -5.5)/12 + (5.0, 9.0)/12
+    //  . . Q . . . . . . . . .  : off =(-5.5, -5.5)/12 + (2.0, 10.0)/12
+    //  . . . . . . . . Q . . .  : off =(-5.5, -5.5)/12 + (8.0, 11.0)/12
+    static const float grid_size = 12.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Explicit xy offsets for rows 0-5 on a grid with cell = subpixel_support_diameter/12:
+    const float2 xy_offset0 = xy_start_offset + float2(3.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(6.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(11.0, 4.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(4.0, 5.0) * xy_step;
+    //  Per-channel (rgb) unnormalized weights; rows 6-11 mirror rows 5-0 and reuse their weights with r and b swapped:
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = w5.bgr;
+    const float3 w7 = w4.bgr;
+    const float3 w8 = w3.bgr;
+    const float3 w9 = w2.bgr;
+    const float3 w10 = w1.bgr;
+    const float3 w11 = w0.bgr;
+    //  Per-channel weight sum (explicit half plus its .bgr mirror) and its reciprocal for normalization:
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/w_sum;
+    //  Scale the caller's pixel-to-uv matrix by aa_pixel_diameter (defined outside this function):
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Map the explicit offsets to uv space; the sign from get_frame_sign(frame) mirrors them on odd
+    //  frames if directed.  Mirrored taps subtract these same uv offsets instead of adding them:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
+    //  Fetch the 12 taps in row order (tex2Daa_tiled_linearize linearizes if necessary, etc.):
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Weighted sum, normalized per channel so the effective weights add up to 1.0:
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11);
+}
+
+float3 tex2Daa16x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Weighted 16-tap average of tex around tex_uv, using a diagonally symmetric
+    //  16-superqueens pattern where no 3 points are exactly collinear.
+    //  . . Q . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (2.0, 0.0)/16
+    //  . . . . . . . . . Q . . . . . .  : off =(-7.5, -7.5)/16 + (9.0, 1.0)/16
+    //  . . . . . . . . . . . . Q . . .  : off =(-7.5, -7.5)/16 + (12.0, 2.0)/16
+    //  . . . . Q . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (4.0, 3.0)/16
+    //  . . . . . . . . Q . . . . . . .  : off =(-7.5, -7.5)/16 + (8.0, 4.0)/16
+    //  . . . . . . . . . . . . . . Q .  : off =(-7.5, -7.5)/16 + (14.0, 5.0)/16
+    //  Q . . . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (0.0, 6.0)/16
+    //  . . . . . . . . . . Q . . . . .  : off =(-7.5, -7.5)/16 + (10.0, 7.0)/16
+    //  . . . . . Q . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (5.0, 8.0)/16
+    //  . . . . . . . . . . . . . . . Q  : off =(-7.5, -7.5)/16 + (15.0, 9.0)/16
+    //  . Q . . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (1.0, 10.0)/16
+    //  . . . . . . . Q . . . . . . . .  : off =(-7.5, -7.5)/16 + (7.0, 11.0)/16
+    //  . . . . . . . . . . . Q . . . .  : off =(-7.5, -7.5)/16 + (11.0, 12.0)/16
+    //  . . . Q . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (3.0, 13.0)/16
+    //  . . . . . . Q . . . . . . . . .  : off =(-7.5, -7.5)/16 + (6.0, 14.0)/16
+    //  . . . . . . . . . . . . . Q . .  : off =(-7.5, -7.5)/16 + (13.0, 15.0)/16
+    static const float grid_size = 16.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Explicit xy offsets for rows 0-7 on a grid with cell = subpixel_support_diameter/16:
+    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(12.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(4.0, 3.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(8.0, 4.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(14.0, 5.0) * xy_step;
+    const float2 xy_offset6 = xy_start_offset + float2(0.0, 6.0) * xy_step;
+    const float2 xy_offset7 = xy_start_offset + float2(10.0, 7.0) * xy_step;
+    //  Per-channel (rgb) unnormalized weights; rows 8-15 mirror rows 7-0 and reuse their weights with r and b swapped:
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const float3 w8 = w7.bgr;
+    const float3 w9 = w6.bgr;
+    const float3 w10 = w5.bgr;
+    const float3 w11 = w4.bgr;
+    const float3 w12 = w3.bgr;
+    const float3 w13 = w2.bgr;
+    const float3 w14 = w1.bgr;
+    const float3 w15 = w0.bgr;
+    //  Per-channel weight sum (explicit half plus its .bgr mirror) and its reciprocal for normalization:
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the caller's pixel-to-uv matrix by aa_pixel_diameter (defined outside this function):
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Map the explicit offsets to uv space; the sign from get_frame_sign(frame) mirrors them on odd
+    //  frames if directed.  Mirrored taps subtract these same uv offsets instead of adding them:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
+    const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
+    const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
+    //  Fetch the 16 taps in row order (tex2Daa_tiled_linearize linearizes if necessary, etc.):
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
+    const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Weighted sum, normalized per channel so the effective weights add up to 1.0:
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15);
+}
+
+float3 tex2Daa20x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Weighted 20-tap average of tex around tex_uv, using a diagonally symmetric 20-superqueens pattern
+    //  where no 3 points are exactly collinear and superqueens have a squared attack radius of 13.
+    //  . . . . . . . Q . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (7.0, 0.0)/20
+    //  . . . . . . . . . . . . . . . . Q . . .  : off =(-9.5, -9.5)/20 + (16.0, 1.0)/20
+    //  . . . . . . . . . . . Q . . . . . . . .  : off =(-9.5, -9.5)/20 + (11.0, 2.0)/20
+    //  . Q . . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (1.0, 3.0)/20
+    //  . . . . . Q . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (5.0, 4.0)/20
+    //  . . . . . . . . . . . . . . . Q . . . .  : off =(-9.5, -9.5)/20 + (15.0, 5.0)/20
+    //  . . . . . . . . . . Q . . . . . . . . .  : off =(-9.5, -9.5)/20 + (10.0, 6.0)/20
+    //  . . . . . . . . . . . . . . . . . . . Q  : off =(-9.5, -9.5)/20 + (19.0, 7.0)/20
+    //  . . Q . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (2.0, 8.0)/20
+    //  . . . . . . Q . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (6.0, 9.0)/20
+    //  . . . . . . . . . . . . . Q . . . . . .  : off =(-9.5, -9.5)/20 + (13.0, 10.0)/20
+    //  . . . . . . . . . . . . . . . . . Q . .  : off =(-9.5, -9.5)/20 + (17.0, 11.0)/20
+    //  Q . . . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (0.0, 12.0)/20
+    //  . . . . . . . . . Q . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (9.0, 13.0)/20
+    //  . . . . Q . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (4.0, 14.0)/20
+    //  . . . . . . . . . . . . . . Q . . . . .  : off =(-9.5, -9.5)/20 + (14.0, 15.0)/20
+    //  . . . . . . . . . . . . . . . . . . Q .  : off =(-9.5, -9.5)/20 + (18.0, 16.0)/20
+    //  . . . . . . . . Q . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (8.0, 17.0)/20
+    //  . . . Q . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (3.0, 18.0)/20
+    //  . . . . . . . . . . . . Q . . . . . . .  : off =(-9.5, -9.5)/20 + (12.0, 19.0)/20
+    static const float grid_size = 20.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Explicit xy offsets for rows 0-9 on a grid with cell = subpixel_support_diameter/20:
+    const float2 xy_offset0 = xy_start_offset + float2(7.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(11.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step;
+    const float2 xy_offset6 = xy_start_offset + float2(10.0, 6.0) * xy_step;
+    const float2 xy_offset7 = xy_start_offset + float2(19.0, 7.0) * xy_step;
+    const float2 xy_offset8 = xy_start_offset + float2(2.0, 8.0) * xy_step;
+    const float2 xy_offset9 = xy_start_offset + float2(6.0, 9.0) * xy_step;
+    //  Per-channel (rgb) unnormalized weights; rows 10-19 mirror rows 9-0 and reuse their weights with r and b swapped:
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
+    const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
+    const float3 w10 = w9.bgr;
+    const float3 w11 = w8.bgr;
+    const float3 w12 = w7.bgr;
+    const float3 w13 = w6.bgr;
+    const float3 w14 = w5.bgr;
+    const float3 w15 = w4.bgr;
+    const float3 w16 = w3.bgr;
+    const float3 w17 = w2.bgr;
+    const float3 w18 = w1.bgr;
+    const float3 w19 = w0.bgr;
+    //  Per-channel weight sum (explicit half plus its .bgr mirror) and its reciprocal for normalization:
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the caller's pixel-to-uv matrix by aa_pixel_diameter (defined outside this function):
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Map the explicit offsets to uv space; the sign from get_frame_sign(frame) mirrors them on odd
+    //  frames if directed.  Mirrored taps subtract these same uv offsets instead of adding them:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
+    const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
+    const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
+    const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign);
+    const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign);
+    //  Fetch the 20 taps in row order (tex2Daa_tiled_linearize linearizes if necessary, etc.):
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb;
+    const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
+    const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
+    const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
+    const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
+    const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Weighted sum, normalized per channel so the effective weights add up to 1.0:
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 +
+        w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19);
+}
+
+float3 tex2Daa24x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  24-sample antialiased lookup.  Use a diagonally symmetric 24-superqueens pattern where no 3
+    //  points are exactly collinear and superqueens have a squared attack radius of 13.
+    //  . . . . . . Q . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (6.0, 0.0)/24
+    //  . . . . . . . . . . . . . . . . Q . . . . . . .  : off =(-11.5, -11.5)/24 + (16.0, 1.0)/24
+    //  . . . . . . . . . . Q . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (10.0, 2.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . . Q . .  : off =(-11.5, -11.5)/24 + (21.0, 3.0)/24
+    //  . . . . . Q . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (5.0, 4.0)/24
+    //  . . . . . . . . . . . . . . . Q . . . . . . . .  : off =(-11.5, -11.5)/24 + (15.0, 5.0)/24
+    //  . Q . . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (1.0, 6.0)/24
+    //  . . . . . . . . . . . Q . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (11.0, 7.0)/24
+    //  . . . . . . . . . . . . . . . . . . . Q . . . .  : off =(-11.5, -11.5)/24 + (19.0, 8.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . . . . Q  : off =(-11.5, -11.5)/24 + (23.0, 9.0)/24
+    //  . . . Q . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (3.0, 10.0)/24
+    //  . . . . . . . . . . . . . . Q . . . . . . . . .  : off =(-11.5, -11.5)/24 + (14.0, 11.0)/24
+    //  . . . . . . . . . Q . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (9.0, 12.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . Q . . .  : off =(-11.5, -11.5)/24 + (20.0, 13.0)/24
+    //  Q . . . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (0.0, 14.0)/24
+    //  . . . . Q . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (4.0, 15.0)/24
+    //  . . . . . . . . . . . . Q . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (12.0, 16.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . . . Q .  : off =(-11.5, -11.5)/24 + (22.0, 17.0)/24
+    //  . . . . . . . . Q . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (8.0, 18.0)/24
+    //  . . . . . . . . . . . . . . . . . . Q . . . . .  : off =(-11.5, -11.5)/24 + (18.0, 19.0)/24
+    //  . . Q . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (2.0, 20.0)/24
+    //  . . . . . . . . . . . . . Q . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (13.0, 21.0)/24
+    //  . . . . . . . Q . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (7.0, 22.0)/24
+    //  . . . . . . . . . . . . . . . . . Q . . . . . .  : off =(-11.5, -11.5)/24 + (17.0, 23.0)/24
+    static const float grid_size = 24.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offsets of pattern rows 0-11 only.  Rows 12-23 are the negations of rows 11-0 (diagonal symmetry), so they are never stored:
+    const float2 xy_offset0 = xy_start_offset + float2(6.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(10.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(21.0, 3.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step;
+    const float2 xy_offset6 = xy_start_offset + float2(1.0, 6.0) * xy_step;
+    const float2 xy_offset7 = xy_start_offset + float2(11.0, 7.0) * xy_step;
+    const float2 xy_offset8 = xy_start_offset + float2(19.0, 8.0) * xy_step;
+    const float2 xy_offset9 = xy_start_offset + float2(23.0, 9.0) * xy_step;
+    const float2 xy_offset10 = xy_start_offset + float2(3.0, 10.0) * xy_step;
+    const float2 xy_offset11 = xy_start_offset + float2(14.0, 11.0) * xy_step;
+    //  Compute subpixel weights for samples 0-11; each mirrored sample 23-k reuses w(k) with its components reversed (.bgr):
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
+    const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
+    const float3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance);
+    const float3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance);
+    const float3 w12 = w11.bgr;
+    const float3 w13 = w10.bgr;
+    const float3 w14 = w9.bgr;
+    const float3 w15 = w8.bgr;
+    const float3 w16 = w7.bgr;
+    const float3 w17 = w6.bgr;
+    const float3 w18 = w5.bgr;
+    const float3 w19 = w4.bgr;
+    const float3 w20 = w3.bgr;
+    const float3 w21 = w2.bgr;
+    const float3 w22 = w1.bgr;
+    const float3 w23 = w0.bgr;
+    //  Get the weight sum to normalize the total to 1.0 later (half_sum.bgr equals w12 + ... + w23):
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 +
+        w5 + w6 + w7 + w8 + w9 + w10 + w11;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets for samples 0-11, mirror on odd frames if directed
+    //  (via get_frame_sign), and exploit diagonal symmetry for the other twelve:
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
+    const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
+    const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
+    const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign);
+    const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign);
+    const float2 uv_offset10 = mul(true_pixel_to_tex_uv, xy_offset10 * frame_sign);
+    const float2 uv_offset11 = mul(true_pixel_to_tex_uv, xy_offset11 * frame_sign);
+    //  Load samples, linearizing if necessary, etc. (tex_uv + offset for samples 0-11, tex_uv - offset for their mirrors 12-23):
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset10).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset11).rgb;
+    const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset11).rgb;
+    const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset10).rgb;
+    const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb;
+    const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb;
+    const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
+    const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
+    const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
+    const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
+    const float3 sample20 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample21 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample22 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample23 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples and normalize per channel by w_sum_inv (so the effective weights sum to 1.0 for each channel):
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 +
+        w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19 +
+        w20 * sample20 + w21 * sample21 + w22 * sample22 + w23 * sample23);
+}
+
+float3 tex2Daa_debug_16x_regular(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Sample on a regular 4x4 grid.  This is mainly for testing.  Note that the frame parameter is never read here.
+    static const float grid_size = 4.0;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offsets of samples 0-7 (the first two grid rows); they are used only to evaluate weights:
+    const float2 xy_offset0 = xy_start_offset + float2(0.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(1.0, 0.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(3.0, 0.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(0.0, 1.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(1.0, 1.0) * xy_step;
+    const float2 xy_offset6 = xy_start_offset + float2(2.0, 1.0) * xy_step;
+    const float2 xy_offset7 = xy_start_offset + float2(3.0, 1.0) * xy_step;
+    //  Compute subpixel weights for samples 0-7, and exploit diagonal symmetry for speed: w(15-k) = w(k).bgr.
+    //  (We can't exploit vertical or horizontal symmetry due to uncertain
+    //  subpixel offsets.  We could fix that by rotating xy offsets with the
+    //  subpixel structure, but...no.)
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const float3 w8 = w7.bgr;
+    const float3 w9 = w6.bgr;
+    const float3 w10 = w5.bgr;
+    const float3 w11 = w4.bgr;
+    const float3 w12 = w3.bgr;
+    const float3 w13 = w2.bgr;
+    const float3 w14 = w1.bgr;
+    const float3 w15 = w0.bgr;
+    //  Get the weight sum to normalize the total to 1.0 later (half_sum.bgr equals w8 + ... + w15):
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    //  Get uv sample offsets, taking advantage of row alignment (the literal -1.5 equals 0.5 - grid_size*0.5 for this 4x4 grid):
+    const float2 uv_step_x = mul(true_pixel_to_tex_uv, float2(xy_step.x, 0.0));
+    const float2 uv_step_y = mul(true_pixel_to_tex_uv, float2(0.0, xy_step.y));
+    const float2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y);
+    const float2 sample0_uv = tex_uv + uv_offset0;
+    const float2 sample4_uv = sample0_uv + uv_step_y;
+    const float2 sample8_uv = sample0_uv + uv_step_y * 2.0;
+    const float2 sample12_uv = sample0_uv + uv_step_y * 3.0;
+    //  Load samples row by row (four per row), linearizing if necessary, etc.:
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, sample0_uv).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 2.0).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 3.0).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, sample4_uv).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 2.0).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 3.0).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, sample8_uv).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 2.0).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 3.0).rgb;
+    const float3 sample12 = tex2Daa_tiled_linearize(tex, sample12_uv).rgb;
+    const float3 sample13 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x).rgb;
+    const float3 sample14 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 2.0).rgb;
+    const float3 sample15 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 3.0).rgb;
+    //  Sum weighted samples and normalize per channel by w_sum_inv (so the effective weights sum to 1.0 for each channel):
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15);
+}
+
+float3 tex2Daa_debug_dynamic(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+    //  This function is for testing only: Use an NxN grid (N = grid_size) with weights computed in a loop.  The frame parameter is never read here.
+    static const int grid_size = 8;
+    assign_aa_cubic_constants();
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float grid_radius_in_samples = (float(grid_size) - 1.0)/2.0;
+    const float2 filter_space_offset_step =
+        subpixel_support_diameter/float2(grid_size);
+    const float2 sample0_filter_space_offset =
+        -grid_radius_in_samples * filter_space_offset_step;
+    //  Compute xy sample offsets and subpixel weights (stored row-major at index i*grid_size + j), accumulating their sum:
+    float3 weights[64]; //  Hard-coded 64 must equal grid_size * grid_size (originally sized by that expression)
+    float3 weight_sum = float3(0.0, 0.0, 0.0);
+    for(int i = 0; i < grid_size; ++i)
+    {
+        for(int j = 0; j < grid_size; ++j)
+        {
+            //  Weights based on xy distances (j steps along x, i steps along y):
+            const float2 offset = sample0_filter_space_offset +
+                float2(j, i) * filter_space_offset_step;
+            const float3 weight = eval_unorm_rgb_weights(offset, final_axis_importance);
+            weights[i*grid_size + j] = weight;
+            weight_sum += weight;
+        }
+    }
+    //  Get uv offset vectors along x and y directions (filter-space steps mapped through the scaled pixel-to-uv matrix):
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(pixel_to_tex_uv * aa_pixel_diameter);
+    const float2 uv_offset_step_x = mul(true_pixel_to_tex_uv,
+        float2(filter_space_offset_step.x, 0.0));
+    const float2 uv_offset_step_y = mul(true_pixel_to_tex_uv,
+        float2(0.0, filter_space_offset_step.y));
+    //  Get a starting sample location (grid_radius_in_samples steps back along both axes from tex_uv):
+    const float2 sample0_uv_offset = -grid_radius_in_samples *
+        (uv_offset_step_x + uv_offset_step_y);
+    const float2 sample0_uv = tex_uv + sample0_uv_offset;
+    //  Load, weight, and sum [linearized] samples, then normalize by the per-channel weight sum:
+    float3 sum = float3(0.0, 0.0, 0.0);
+    const float3 weight_sum_inv = float3(1.0)/weight_sum;
+    for(int i = 0; i < grid_size; ++i)
+    {
+        const float2 row_i_first_sample_uv =
+            sample0_uv + i * uv_offset_step_y;
+        for(int j = 0; j < grid_size; ++j)
+        {
+            const float2 sample_uv =
+                row_i_first_sample_uv + j * uv_offset_step_x;
+            sum += weights[i*grid_size + j] *
+                tex2Daa_tiled_linearize(tex, sample_uv).rgb;
+        }
+    }
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  ANTIALIASING CODEPATH SELECTION  //////////////////////
+
+inline float3 tex2Daa(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
+{
+#define DEBUG
+#ifdef DEBUG
+	return tex2Daa_subpixel_weights_only(
+            tex, tex_uv, pixel_to_tex_uv);
+#else
+	//  Statically switch between antialiasing modes/levels by aa_level.  NOTE: this branch is compiled out, because DEBUG is #defined unconditionally just above the #ifdef; every call therefore uses tex2Daa_subpixel_weights_only() and ignores aa_level and frame.
+    return (aa_level < 0.5) ? tex2D_linearize(tex, tex_uv).rgb :
+        (aa_level < 3.5) ? tex2Daa_subpixel_weights_only(
+            tex, tex_uv, pixel_to_tex_uv) :
+        (aa_level < 4.5) ? tex2Daa4x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 5.5) ? tex2Daa5x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 6.5) ? tex2Daa6x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 7.5) ? tex2Daa7x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 11.5) ? tex2Daa8x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 15.5) ? tex2Daa12x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 19.5) ? tex2Daa16x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 23.5) ? tex2Daa20x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 253.5) ? tex2Daa24x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 254.5) ? tex2Daa_debug_16x_regular(
+            tex, tex_uv, pixel_to_tex_uv, frame) :
+        tex2Daa_debug_dynamic(tex, tex_uv, pixel_to_tex_uv, frame);
+#endif
+}
+
+
+#endif  //  TEX2DANTIALIAS_H
+
+/////////////////////////  END TEX2DANTIALIAS  /////////////////////////
+
+//#include "geometry-functions.h"
+
+/////////////////////////  BEGIN GEOMETRY-FUNCTIONS  /////////////////////////
+
+#ifndef GEOMETRY_FUNCTIONS_H
+#define GEOMETRY_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+// already included elsewhere
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+//#include "bind-shader-h"
+
+
+////////////////////////////  MACROS AND CONSTANTS  ////////////////////////////
+
+//  Curvature-related constants:
+#define MAX_POINT_CLOUD_SIZE 9
+
+
+/////////////////////////////  CURVATURE FUNCTIONS /////////////////////////////
+
+float2 quadratic_solve(const float a, const float b_over_2, const float c)
+{
+    //  Requires:   1.) a, b, and c are quadratic formula coefficients
+    //              2.) b_over_2 = b/2.0 (simplifies terms to factor 2 out)
+    //              3.) b_over_2 must be guaranteed < 0.0 (avoids a branch)
+    //  Returns:    float2(first_solution, discriminant), where discriminant is b_over_2^2 - a*c.  sqrt()
+    //              runs unconditionally, so the caller must check the discriminant to handle the "no
+    //              intersection" case.  The Kahan or Citardauq formula is used for numerical robustness.
+    const float discriminant = b_over_2*b_over_2 - a*c;
+    const float solution0 = c/(-b_over_2 + sqrt(discriminant));
+    return float2(solution0, discriminant);
+}
+
+float2 intersect_sphere(const float3 view_vec, const float3 eye_pos_vec)
+{
+    //  Requires:   1.) view_vec and eye_pos_vec are 3D vectors in the sphere's
+    //                  local coordinate frame (eye_pos_vec is a position, i.e.
+    //                  a vector from the origin to the eye/camera)
+    //              2.) geom_radius is a global containing the sphere's radius
+    //  Returns:    Cast a ray of direction view_vec from eye_pos_vec at a
+    //              sphere of radius geom_radius, and return float2(distance to the
+    //              first intersection in units of length(view_vec), discriminant).
+    //              http://wiki.cgsociety.org/index.php/Ray_Sphere_Intersection
+    //  Quadratic formula coefficients (b_over_2 is guaranteed negative), passed on to quadratic_solve():
+    const float a = dot(view_vec, view_vec);
+    const float b_over_2 = dot(view_vec, eye_pos_vec);  //  * 2.0 factored out
+    const float c = dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius;
+    return quadratic_solve(a, b_over_2, c);
+}
+
+float2 intersect_cylinder(const float3 view_vec, const float3 eye_pos_vec)
+{
+    //  Requires:   1.) view_vec and eye_pos_vec are 3D vectors in the cylinder's
+    //                  local coordinate frame (eye_pos_vec is a position, i.e.
+    //                  a vector from the origin to the eye/camera)
+    //              2.) geom_radius is a global containing the cylinder's radius
+    //  Returns:    Cast a ray of direction view_vec from eye_pos_vec at a
+    //              cylinder of radius geom_radius, and return float2(distance to
+    //              the first intersection in units of length(view_vec),
+    //              discriminant).  The derivation of the coefficients is in
+    //              Christer Ericson's Real-Time Collision Detection, p. 195-196,
+    //              and this version uses Lagrange's identity to reduce operations.
+    //  Arbitrary "cylinder top" reference point for an infinite cylinder whose axis runs along y:
+    const float3 cylinder_top_vec = float3(0.0, geom_radius, 0.0);
+    const float3 cylinder_axis_vec = float3(0.0, 1.0, 0.0);//  Unit-length axis; an earlier form was float3(0.0, 2.0*geom_radius, 0.0);
+    const float3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec;
+    const float3 axis_x_view = cross(cylinder_axis_vec, view_vec);
+    const float3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec);
+    //  Quadratic formula coefficients (b_over_2 is guaranteed negative), passed on to quadratic_solve():
+    const float a = dot(axis_x_view, axis_x_view);
+    const float b_over_2 = dot(axis_x_top_to_eye, axis_x_view);
+    const float c = dot(axis_x_top_to_eye, axis_x_top_to_eye) -
+        geom_radius*geom_radius;//  The factor *dot(cylinder_axis_vec, cylinder_axis_vec) is omitted: it is 1.0 for the unit axis above.
+    return quadratic_solve(a, b_over_2, c);
+}
+
+float2 cylinder_xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect)
+{
+    //  Requires:   An xyz intersection position on a cylinder.
+    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
+    //  Mapping:    Define square_uv.x to be the signed arc length in xz-space,
+    //              and define square_uv.y = -intersection_pos_local.y (+v = -y).
+    //  Start with a numerically robust arc length calculation: atan2(x, z) scaled by geom_radius.
+    const float angle_from_image_center = atan2(intersection_pos_local.x,
+        intersection_pos_local.z);
+    const float signed_arc_len = angle_from_image_center * geom_radius;
+    //  Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide
+    //  by the aspect ratio (geom_aspect) to stretch the mapping appropriately:
+    const float2 square_uv = float2(signed_arc_len, -intersection_pos_local.y);
+    const float2 video_uv = square_uv / geom_aspect;
+    return video_uv;
+}
+
+float3 cylinder_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
+{
+    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
+    //  Returns:    An xyz intersection position on a cylinder.  This is the
+    //              inverse of cylinder_xyz_to_uv().
+    //  Expand video_uv by the aspect ratio to get proportionate x/y lengths, then treat
+    //  square_uv.x as an arc length around the cylinder and square_uv.y as the (negated) height.
+    const float2 square_uv = video_uv * geom_aspect;
+    const float arc_len = square_uv.x;
+    const float angle_from_image_center = arc_len / geom_radius;
+    const float x_pos = sin(angle_from_image_center) * geom_radius;
+    const float z_pos = cos(angle_from_image_center) * geom_radius;
+    //  Or: z = sqrt(geom_radius**2 - x**2)
+    //  Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle)
+    const float3 intersection_pos_local = float3(x_pos, -square_uv.y, z_pos);
+    return intersection_pos_local;
+}
+
+float2 sphere_xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect)
+{
+    //  Requires:   An xyz intersection position on a sphere.
+    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
+    //  Mapping:    First define square_uv.x/square_uv.y ==
+    //              intersection_pos_local.x/intersection_pos_local.y.  Then,
+    //              length(square_uv) is the arc length from the image center
+    //              at (0.0, 0.0, geom_radius) along the tangent great circle.
+    //              Credit for this mapping goes to cgwg: I never managed to
+    //              understand his code, but he told me his mapping was based on
+    //              great circle distances when I asked him about it, which
+    //              informed this very similar (almost identical) mapping.
+    //  Start with a numerically robust arc length calculation between the ray-
+    //  sphere intersection point and the image center, atan2(|a x b|, a . b), using a
+    //  method posted by Roger Stafford on comp.soft-sys.matlab:
+    //  https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ
+    const float3 image_center_pos_local = float3(0.0, 0.0, geom_radius);
+    const float cp_len =
+        length(cross(intersection_pos_local, image_center_pos_local));
+    const float dp = dot(intersection_pos_local, image_center_pos_local);
+    const float angle_from_image_center = atan2(cp_len, dp);
+    const float arc_len = angle_from_image_center * geom_radius;
+    //  Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide by the aspect ratio to stretch the mapping
+    //  appropriately.  NOTE(review): normalize() receives (0, 0) when x == y == 0 (the exact image center) -- confirm callers tolerate that.
+    const float2 square_uv_unit = normalize(float2(intersection_pos_local.x,
+        -intersection_pos_local.y));
+    const float2 square_uv = arc_len * square_uv_unit;
+    const float2 video_uv = square_uv / geom_aspect;
+    return video_uv;
+}
+
+float3 sphere_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
+{
+    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
+    //  Returns:    An xyz intersection position on a sphere.  This is the
+    //              inverse of sphere_xyz_to_uv().
+    //  Expand video_uv by the aspect ratio to get proportionate x/y lengths,
+    //  then calculate an xyz position for the spherical mapping above.
+    const float2 square_uv = video_uv * geom_aspect;
+    //  Using length or sqrt here butchers the framerate on my 8800GTS if this function is called too many times, and so
+    //  does taking the max component of square_uv/square_uv_unit (program length threshold?).  So arc_len is recovered as
+    //  y / unit_y instead, which equals length(square_uv) when y != 0.  NOTE(review): y == 0 divides by zero -- confirm.
+    //float arc_len = length(square_uv);
+    const float2 square_uv_unit = normalize(square_uv);
+    const float arc_len = square_uv.y/square_uv_unit.y;
+    const float angle_from_image_center = arc_len / geom_radius;
+    const float xy_dist_from_sphere_center =
+        sin(angle_from_image_center) * geom_radius;
+    //  Disabled alternative: float2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len));
+    const float2 xy_pos = xy_dist_from_sphere_center * square_uv_unit;
+    const float z_pos = cos(angle_from_image_center) * geom_radius;
+    const float3 intersection_pos_local = float3(xy_pos.x, -xy_pos.y, z_pos);
+    return intersection_pos_local;
+}
+
+float2 sphere_alt_xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect)
+{
+    //  Requires:   An xyz intersection position on a sphere.
+    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
+    //  Mapping:    Define square_uv.x to be the signed arc length in xz-space,
+    //              and define square_uv.y == signed arc length in yz-space.
+    //  See cylinder_xyz_to_uv() for implementation details (very similar); here atan2 is applied to (x, -y) against z component-wise.
+    const float2 angle_from_image_center = atan2(
+        float2(intersection_pos_local.x, -intersection_pos_local.y),
+        intersection_pos_local.zz);
+    const float2 signed_arc_len = angle_from_image_center * geom_radius;
+    const float2 video_uv = signed_arc_len / geom_aspect;
+    return video_uv;
+}
+
+float3 sphere_alt_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
+{
+    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
+    //  Returns:    An xyz intersection position on a sphere.  This is the
+    //              inverse of sphere_alt_xyz_to_uv().
+    //  See cylinder_uv_to_xyz() for implementation details (very similar); z is the positive root of geom_radius^2 - x^2 - y^2.
+    const float2 square_uv = video_uv * geom_aspect;
+    const float2 arc_len = square_uv;
+    const float2 angle_from_image_center = arc_len / geom_radius;
+    const float2 xy_pos = sin(angle_from_image_center) * geom_radius;
+    const float z_pos = sqrt(geom_radius*geom_radius - dot(xy_pos, xy_pos));
+    return float3(xy_pos.x, -xy_pos.y, z_pos);
+}
+
+inline float2 intersect(const float3 view_vec_local, const float3 eye_pos_local,
+    const float geom_mode)
+{
+    //  Pick the ray/primitive intersection routine for this geometry mode:
+    //  modes below 2.5 use the sphere, everything else uses the cylinder.
+    //  The result is forwarded untouched; view_vec_to_uv() reads .x as the
+    //  distance along the view vector and .y as the discriminant.
+    if(geom_mode < 2.5)
+    {
+        return intersect_sphere(view_vec_local, eye_pos_local);
+    }
+    return intersect_cylinder(view_vec_local, eye_pos_local);
+}
+
+inline float2 xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect, const float geom_mode)
+{
+    //  Convert a local-frame surface position to uv with the mapping that
+    //  matches geom_mode:
+    //      geom_mode < 1.5  -> sphere
+    //      geom_mode < 2.5  -> sphere, alternate mapping
+    //      otherwise        -> cylinder
+    if(geom_mode < 1.5)
+    {
+        return sphere_xyz_to_uv(intersection_pos_local, geom_aspect);
+    }
+    if(geom_mode < 2.5)
+    {
+        return sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect);
+    }
+    return cylinder_xyz_to_uv(intersection_pos_local, geom_aspect);
+}
+
+inline float3 uv_to_xyz(const float2 uv, const float2 geom_aspect,
+    const float geom_mode)
+{
+    //  Convert uv coords to a local-frame surface position with the mapping
+    //  that matches geom_mode (same thresholds as xyz_to_uv()):
+    //      geom_mode < 1.5  -> sphere
+    //      geom_mode < 2.5  -> sphere, alternate mapping
+    //      otherwise        -> cylinder
+    if(geom_mode < 1.5)
+    {
+        return sphere_uv_to_xyz(uv, geom_aspect);
+    }
+    if(geom_mode < 2.5)
+    {
+        return sphere_alt_uv_to_xyz(uv, geom_aspect);
+    }
+    return cylinder_uv_to_xyz(uv, geom_aspect);
+}
+
+float2 view_vec_to_uv(const float3 view_vec_local, const float3 eye_pos_local,
+    const float2 geom_aspect, const float geom_mode, out float3 intersection_pos)
+{
+    //  Requires:   An eye position and view vector that are already in the
+    //              primitive's local coordinate frame.
+    //  Returns:    The uv coords of the point where the view ray meets the
+    //              primitive, or float2(1.0) when the discriminant is not
+    //              above 0.005.  intersection_pos always receives
+    //              eye + view * distance, even in the fallback case.
+    const float2 dist_and_discriminant =
+        intersect(view_vec_local, eye_pos_local, geom_mode);
+    //  Walk along the view ray by the reported distance:
+    const float3 hit_pos_local =
+        eye_pos_local + view_vec_local * dist_and_discriminant.x;
+    intersection_pos = hit_pos_local;
+    //  Only convert to uv when the ray really hits the primitive:
+    if(dist_and_discriminant.y > 0.005)
+    {
+        return xyz_to_uv(hit_pos_local, geom_aspect, geom_mode);
+    }
+    //  Otherwise return the fixed fallback coords:
+    return float2(1.0);
+}
+
+float3 get_ideal_global_eye_pos_for_points(float3 eye_pos,
+    const float2 geom_aspect, const float3 global_coords[MAX_POINT_CLOUD_SIZE],
+    const int num_points)
+{
+    //  Purpose:    Refine a starting camera position so the sampled CRT
+    //              points fill the view as much as possible.
+    //  Requires:   Parameters:
+    //              1.) Starting eye_pos is a global 3D position at which the
+    //                  camera contains all points in global_coords[] in its FOV
+    //              2.) geom_aspect = get_aspect_vector(
+    //                      output_size.x / output_size.y);
+    //              3.) global_coords is a point cloud containing global xyz
+    //                  coords of extreme points on the simulated CRT screen.
+    //              4.) num_points is how many entries of global_coords[] are
+    //                  read by the loops below.
+    //              Globals:
+    //              1.) geom_view_dist must be > 0.0.  It controls the "near
+    //                  plane" used to interpret flat_video_uv as a view
+    //                  vector, which controls the field of view (FOV).
+    //              2.) geom_radius scales the initial (sentinel) offsets.
+    //              Eyespace coordinate frame: +x = right, +y = up, +z = back
+    //  Returns:    Return an eye position at which the point cloud spans as
+    //              much of the screen as possible (given the FOV controlled by
+    //              geom_view_dist) without being cropped or sheared.
+    //  Algorithm:
+    //  1.) Move the eye laterally to a point which attempts to maximize
+    //      the amount we can move forward without clipping the CRT screen.
+    //  2.) Move forward by as much as possible without clipping the CRT.
+    //  Get the allowed movement range by solving for the eye_pos offsets
+    //  that result in each point being projected to a screen edge/corner in
+    //  pseudo-normalized device coords (where xy ranges from [-0.5, 0.5]
+    //  and z = eyespace z):
+    //      pndc_coord = float3(float2(eyespace_xyz.x, -eyespace_xyz.y)*
+    //      geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z);
+    //  Notes:
+    //  The field of view is controlled by geom_view_dist's magnitude relative to
+    //  the view vector's x and y components:
+    //      view_vec.xy ranges from [-0.5, 0.5] * geom_aspect
+    //      view_vec.z = -geom_view_dist
+    //  But for the purposes of perspective divide, it should be considered:
+    //      view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist
+    //      view_vec.z = -1.0
+    static const int max_centering_iters = 1;  //  Keep for easy testing.
+    for(int iter = 0; iter < max_centering_iters; iter++)
+    {
+        //  0.) Get the eyespace coordinates of our point cloud:
+        float3 eyespace_coords[MAX_POINT_CLOUD_SIZE];
+        for(int i = 0; i < num_points; i++)
+        {
+            eyespace_coords[i] = global_coords[i] - eye_pos;
+        }
+        //  1a.)For each point, find out how far we can move eye_pos in each
+        //      lateral direction without the point clipping the frustum.
+        //      Eyespace +y = up, screenspace +y = down, so flip y after
+        //      applying the eyespace offset (on the way to "clip space").
+        //  Solve for two offsets per point based on:
+        //      (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) *
+        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(-0.5)
+        //      (eyespace_xyz.xy - offset_ul) * float2(1.0, -1.0) *
+        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(0.5)
+        //  offset_ul and offset_dr represent the farthest we can move the
+        //  eye_pos up-left and down-right.  Save the min of all offset_dr's
+        //  and the max of all offset_ul's (since it's negative).
+        float abs_radius = abs(geom_radius);  //  In case anyone gets ideas. ;)
+        //  Seed the running min/max with values 10 radii away, so that the
+        //  first real offset normally replaces them:
+        float2 offset_dr_min = float2(10.0 * abs_radius, 10.0 * abs_radius);
+        float2 offset_ul_max = float2(-10.0 * abs_radius, -10.0 * abs_radius);
+        for(int i = 0; i < num_points; i++)
+        {
+            //  Dividing by flipy applies the eyespace->screenspace y flip:
+            static const float2 flipy = float2(1.0, -1.0);
+            float3 eyespace_xyz = eyespace_coords[i];
+            float2 offset_dr = eyespace_xyz.xy - float2(-0.5) *
+                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
+            float2 offset_ul = eyespace_xyz.xy - float2(0.5) *
+                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
+            offset_dr_min = min(offset_dr_min, offset_dr);
+            offset_ul_max = max(offset_ul_max, offset_ul);
+        }
+        //  1b.)Update eye_pos: Adding the average of offset_ul_max and
+        //      offset_dr_min gives it equal leeway on the top vs. bottom
+        //      and left vs. right.  Recalculate eyespace_coords accordingly.
+        float2 center_offset = 0.5 * (offset_ul_max + offset_dr_min);
+        eye_pos.xy += center_offset;
+        for(int i = 0; i < num_points; i++)
+        {
+            eyespace_coords[i] = global_coords[i] - eye_pos;
+        }
+        //  2a.)For each point, find out how far we can move eye_pos forward
+        //      without the point clipping the frustum.  Flip the y
+        //      direction in advance (matters for a later step, not here).
+        //      Solve for four offsets per point based on:
+        //      eyespace_xyz_flipy.x * geom_view_dist /
+        //          (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) =-0.5
+        //      eyespace_xyz_flipy.y * geom_view_dist /
+        //          (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) =-0.5
+        //      eyespace_xyz_flipy.x * geom_view_dist /
+        //          (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) = 0.5
+        //      eyespace_xyz_flipy.y * geom_view_dist /
+        //          (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) = 0.5
+        //      We'll vectorize the actual computation.  Take the maximum of
+        //      these four for a single offset, and continue taking the max
+        //      for every point (use max because offset.z is negative).
+        //  NOTE(review): this seed uses geom_radius directly, whereas step
+        //  1a uses abs_radius; confirm whether abs_radius was intended.
+        float offset_z_max = -10.0 * geom_radius * geom_view_dist;
+        for(int i = 0; i < num_points; i++)
+        {
+            float3 eyespace_xyz_flipy = eyespace_coords[i] *
+                float3(1.0, -1.0, 1.0);
+            //  Components are the z offsets for (x=-0.5, y=-0.5, x=0.5,
+            //  y=0.5), in the same order as the four equations above:
+            float4 offset_zzzz = eyespace_xyz_flipy.zzzz +
+                (eyespace_xyz_flipy.xyxy * geom_view_dist) /
+                (float4(-0.5, -0.5, 0.5, 0.5) * float4(geom_aspect, geom_aspect));
+            //  Ignore offsets that push positive x/y values to opposite
+            //  boundaries, and vice versa, and don't let the camera move
+            //  past a point in the dead center of the screen:
+            offset_z_max = (eyespace_xyz_flipy.x < 0.0) ?
+                max(offset_z_max, offset_zzzz.x) : offset_z_max;
+            offset_z_max = (eyespace_xyz_flipy.y < 0.0) ?
+                max(offset_z_max, offset_zzzz.y) : offset_z_max;
+            offset_z_max = (eyespace_xyz_flipy.x > 0.0) ?
+                max(offset_z_max, offset_zzzz.z) : offset_z_max;
+            offset_z_max = (eyespace_xyz_flipy.y > 0.0) ?
+                max(offset_z_max, offset_zzzz.w) : offset_z_max;
+            offset_z_max = max(offset_z_max, eyespace_xyz_flipy.z);
+        }
+        //  2b.)Update eye_pos: Add the maximum (smallest negative) z offset.
+        eye_pos.z += offset_z_max;
+    }
+    return eye_pos;
+}
+
+float3 get_ideal_global_eye_pos(const float3x3 local_to_global,
+    const float2 geom_aspect, const float geom_mode)
+{
+    //  Requires:   1.) local_to_global transforms CRT-local positions into
+    //                  the global frame.
+    //              2.) geom_aspect = get_aspect_vector(
+    //                      output_size.x / output_size.y);
+    //              3.) geom_mode selects the uv<=>xyz mapping (see uv_to_xyz).
+    //              Globals: geom_view_dist, geom_radius
+    //  Returns:    A global eye position.  This is the optimized position
+    //              from get_ideal_global_eye_pos_for_points(), unless any
+    //              sampled screen point has a negative global z, in which
+    //              case the conservative initial position is returned.
+    //  Start with an initial eye_pos that includes the entire primitive
+    //  (sphere or cylinder) in its field-of-view.  high_view/low_view are the
+    //  view vectors through the top and bottom screen edge centers, so the
+    //  angle between them is the vertical FOV:
+    const float3 high_view = float3(0.0, geom_aspect.y, -geom_view_dist);
+    const float3 low_view = high_view * float3(1.0, -1.0, 1.0);
+    const float len_sq = dot(high_view, high_view);
+    const float fov = abs(acos(dot(high_view, low_view)/len_sq));
+    //  Trigonometry/similar triangles say distance = geom_radius/sin(fov/2):
+    const float eye_z_spherical = geom_radius/sin(fov*0.5);
+    const float3 eye_pos = geom_mode < 2.5 ?
+        float3(0.0, 0.0, eye_z_spherical) :
+        float3(0.0, 0.0, max(geom_view_dist, eye_z_spherical));
+
+    //  Get global xyz coords of extreme sample points on the simulated CRT
+    //  screen.  Start with the center, edge centers, and corners of the
+    //  video image.  We can't ignore backfacing points: They're occluded
+    //  by closer points on the primitive, but they may NOT be occluded by
+    //  the convex hull of the remaining samples (i.e. the remaining convex
+    //  hull might not envelope points that do occlude a back-facing point.)
+    static const int num_points = MAX_POINT_CLOUD_SIZE;
+    float3 global_coords[MAX_POINT_CLOUD_SIZE];
+    global_coords[0] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.0), geom_aspect, geom_mode));
+    global_coords[1] = mul(local_to_global, uv_to_xyz(float2(0.0, -0.5), geom_aspect, geom_mode));
+    global_coords[2] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.5), geom_aspect, geom_mode));
+    global_coords[3] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.0), geom_aspect, geom_mode));
+    global_coords[4] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.0), geom_aspect, geom_mode));
+    global_coords[5] = mul(local_to_global, uv_to_xyz(float2(-0.5, -0.5), geom_aspect, geom_mode));
+    global_coords[6] = mul(local_to_global, uv_to_xyz(float2(0.5, -0.5), geom_aspect, geom_mode));
+    global_coords[7] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.5), geom_aspect, geom_mode));
+    global_coords[8] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.5), geom_aspect, geom_mode));
+    //  Adding more inner image points could help in extreme cases, but too many
+    //  points will kill the framerate.  For safety, default to the initial
+    //  eye_pos if any z coords are negative.  Every sampled point must be
+    //  tested here (index i), not just the center point at index 0:
+    float num_negative_z_coords = 0.0;
+    for(int i = 0; i < num_points; i++)
+    {
+        num_negative_z_coords += float(global_coords[i].z < 0.0);
+    }
+    //  Outsource the optimized eye_pos calculation:
+    return num_negative_z_coords > 0.5 ? eye_pos :
+        get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect,
+            global_coords, num_points);
+}
+
+float3x3 get_pixel_to_object_matrix(const float3x3 global_to_local,
+    const float3 eye_pos_local, const float3 view_vec_global,
+    const float3 intersection_pos_local, const float3 normal,
+    const float2 output_size_inv)
+{
+    //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
+    //              descriptions of each parameter.
+    //  Returns:    A matrix taking 2D pixel-space vectors (where (+1.0, +1.0)
+    //              points one pixel down-right, like uv texels) to 3D
+    //              object-space vectors in the CRT's local right-handed
+    //              frame ***that lie in the plane tangent to the CRT surface
+    //              at the intersection position.***  The caller later maps
+    //              these surface vectors to uv vectors.
+    //  Approach:   Build a piecewise-linear approximation by intersecting
+    //              the neighboring pixels' view rays with the tangent plane
+    //              (cheaper than 3 full view-object intersections).
+    //  1.) Local view vectors through the pixels one step right and one
+    //      step down (screen-down is -y in the global frame):
+    const float3 right_view_local = mul(global_to_local,
+        view_vec_global + float3(output_size_inv.x, 0.0, 0.0));
+    const float3 down_view_local = mul(global_to_local,
+        view_vec_global + float3(0.0, -output_size_inv.y, 0.0));
+    //  2.) Ray/plane intersection: the plane passes through the true
+    //      intersection point with the given normal, so the ray parameter
+    //      is dot(hit - eye, normal) / dot(view, normal).
+    const float eye_to_hit_dot_normal =
+        dot(intersection_pos_local - eye_pos_local, normal);
+    const float3 right_hit = eye_pos_local +
+        (eye_to_hit_dot_normal / dot(right_view_local, normal)) *
+        right_view_local;
+    const float3 down_hit = eye_pos_local +
+        (eye_to_hit_dot_normal / dot(down_view_local, normal)) *
+        down_view_local;
+    //  3.) Offsets from the true intersection are the object-space images
+    //      of the (1.0, 0.0) and (0.0, 1.0) pixel offsets.  They become the
+    //      first two basis vectors; the map is 2D->3D, so the third basis
+    //      vector is (0, 0, 0).
+    const float3 right_tangent = right_hit - intersection_pos_local;
+    const float3 down_tangent = down_hit - intersection_pos_local;
+    return float3x3(
+        right_tangent.x, down_tangent.x, 0.0,
+        right_tangent.y, down_tangent.y, 0.0,
+        right_tangent.z, down_tangent.z, 0.0);
+}
+
+float3x3 get_object_to_tangent_matrix(const float3 intersection_pos_local,
+    const float3 normal, const float2 geom_aspect, const float geom_mode)
+{
+    //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
+    //              descriptions of each parameter.
+    //  Returns:    A matrix taking 3D object-space vectors in the CRT's
+    //              local frame (right-handed, +y = up) to 2D video_uv
+    //              vectors (+v = down).
+    //  Background:
+    //  [tangent, bitangent, normal] (TBN) maps ordinary vectors from tangent
+    //  space to object space.  [cotangent, cobitangent, normal] maps normal
+    //  vectors (covectors) the same way and is the inverse-transpose of TBN.
+    //  What we need is the inverse of TBN, i.e. the transpose of the
+    //  cotangent matrix.  The basis vectors follow Christian Schüler's
+    //  "Followup: Normal Mapping Without Precomputed Tangents":
+    //      http://www.thetenthplanet.de/archives/1180
+    //  For the cylindrical and ordinary spherical mappings, the u/v scale
+    //  (and hence tangent/bitangent length) comes purely from the aspect
+    //  ratio; the alternate spherical mapping is more involved.  Cotangent
+    //  and cobitangent lengths must be scaled to match, based on each
+    //  primitive's uv<=>xyz mapping.
+    const float3 surface_pos = intersection_pos_local;
+    static const float3 unit_x = float3(1.0, 0.0, 0.0);
+    static const float3 unit_y = float3(0.0, 1.0, 0.0);
+    //  Tangent/bitangent follow increasing u/v.  Where possible the
+    //  cotangent/cobitangent are computed directly instead of via those.
+    float3 raw_cotangent, raw_cobitangent;
+    //  geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE.
+    if(geom_mode < 1.5)
+    {
+        //  Sphere.  With
+        //      tangent = normalize(cross(normal, cross(x, pos))) * aspect.x
+        //      bitangent = normalize(cross(cross(y, pos), normal)) * aspect.y
+        //  the cross products with the normal simplify to the forms below
+        //  (the common 1/determinant factor is applied after the branch):
+        raw_cotangent = normalize(cross(unit_y, surface_pos)) * geom_aspect.y;
+        raw_cobitangent = normalize(cross(unit_x, surface_pos)) * geom_aspect.x;
+    }
+    else if(geom_mode < 2.5)
+    {
+        //  Sphere, alternate mapping.  It behaves somewhat like a
+        //  cylindrical mapping along both axes, so no shortcut is used:
+        //  build the tangent and bitangent, then cross with the normal.
+        const float3 tangent_vec = normalize(
+            cross(unit_y, float3(surface_pos.x, 0.0, surface_pos.z))) *
+            geom_aspect.x;
+        const float3 bitangent_vec = normalize(
+            cross(unit_x, float3(0.0, surface_pos.yz))) * geom_aspect.y;
+        raw_cotangent = cross(normal, bitangent_vec);
+        raw_cobitangent = cross(tangent_vec, normal);
+    }
+    else
+    {
+        //  Cylinder.  With
+        //      tangent = normalize(cross(y, normal)) * aspect.x
+        //      bitangent = float3(0.0, -aspect.y, 0.0)
+        //  the cotangent/cobitangent reduce to:
+        raw_cotangent = cross(unit_y, normal) * geom_aspect.y;
+        raw_cobitangent = float3(0.0, -geom_aspect.x, 0.0);
+    }
+    //  The shared scale is 1/length(cross(cobitangent, cotangent)):
+    const float3 frame_cross = cross(raw_cobitangent, raw_cotangent);
+    const float inv_determinant = rsqrt(dot(frame_cross, frame_cross));
+    //  [cotangent, cobitangent, normal] as column vectors form the cotangent
+    //  frame (inverse-transpose TBN); constructing from these three vectors
+    //  is intended to give its transpose:
+    return float3x3(raw_cotangent * inv_determinant,
+        raw_cobitangent * inv_determinant, normal);
+}
+
+float2 get_curved_video_uv_coords_and_tangent_matrix(
+    const float2 flat_video_uv, const float3 eye_pos_local,
+    const float2 output_size_inv, const float2 geom_aspect,
+    const float geom_mode, const float3x3 global_to_local,
+    out float2x2 pixel_to_tangent_video_uv)
+{
+    //  Purpose:    Map flat screen uv coords onto the curved CRT surface and
+    //              report how pixel-space offsets map to uv offsets there.
+    //  Requires:   Parameters:
+    //              1.) flat_video_uv coords are in range [0.0, 1.0], where
+    //                  (0.0, 0.0) is the top-left corner of the screen and
+    //                  (1.0, 1.0) is the bottom-right corner.
+    //              2.) eye_pos_local is the 3D camera position in the simulated
+    //                  CRT's local coordinate frame.  For best results, it must
+    //                  be computed based on the same geom_view_dist used here.
+    //              3.) output_size_inv = float2(1.0)/output_size
+    //              4.) geom_aspect = get_aspect_vector(
+    //                      output_size.x / output_size.y);
+    //              5.) geom_mode is a static or runtime mode setting:
+    //                  0 = off, 1 = sphere, 2 = sphere alt., 3 = cylinder
+    //              6.) global_to_local is a 3x3 matrix transforming (ordinary)
+    //                  worldspace vectors to the CRT's local coordinate frame
+    //              Globals:
+    //              1.) geom_view_dist must be > 0.0.  It controls the "near
+    //                  plane" used to interpret flat_video_uv as a view
+    //                  vector, which controls the field of view (FOV).
+    //              2.) geom_force_correct_tangent_matrix selects between the
+    //                  curvature-aware and flat matrices when derivatives
+    //                  are unavailable (see the #else branch below).
+    //  Returns:    Return final uv coords in [0.0, 1.0], and return a pixel-
+    //              space to video_uv tangent-space matrix in the out parameter.
+    //              (This matrix assumes pixel-space +y = down, like +v = down.)
+    //              We'll transform flat_video_uv into a view vector, project
+    //              the view vector from the camera/eye, intersect with a sphere
+    //              or cylinder representing the simulated CRT, and convert the
+    //              intersection position into final uv coords and a local
+    //              transformation matrix.
+    //  First get the 3D view vector (geom_aspect and geom_view_dist are globals):
+    //  1.) Center uv around (0.0, 0.0) and make (-0.5, -0.5) and (0.5, 0.5)
+    //      correspond to the top-left/bottom-right output screen corners.
+    //  2.) Multiply by geom_aspect to preemptively "undo" Retroarch's screen-
+    //      space 2D aspect correction.  We'll reapply it in uv-space.
+    //  3.) (x, y) = (u, -v), because +v is down in 2D screenspace, but +y
+    //      is up in 3D worldspace (enforce a right-handed system).
+    //  4.) The view vector z controls the "near plane" distance and FOV.
+    //      For the effect of "looking through a window" at a CRT, it should be
+    //      set equal to the user's distance from their physical screen, in
+    //      units of the viewport's physical diagonal size.
+    const float2 view_uv = (flat_video_uv - float2(0.5)) * geom_aspect;
+    const float3 view_vec_global =
+        float3(view_uv.x, -view_uv.y, -geom_view_dist);
+    //  Transform the view vector into the CRT's local coordinate frame, convert
+    //  to video_uv coords, and get the local 3D intersection position:
+    const float3 view_vec_local = mul(global_to_local, view_vec_global);
+    float3 pos;
+    const float2 centered_uv = view_vec_to_uv(
+        view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos);
+    //  Shift from the centered [-0.5, 0.5] range back to [0.0, 1.0].  When
+    //  the view ray misses the primitive, view_vec_to_uv() yields
+    //  float2(1.0), so video_uv lands at 1.5, outside the valid range:
+    const float2 video_uv = centered_uv + float2(0.5);
+    //  Get a pixel-to-tangent-video-uv matrix.  The caller could deal with
+    //  all but one of these cases, but that would be more complicated.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        //  Derivatives obtain a matrix very fast, but the direction of pixel-
+        //  space +y seems to depend on the pass.  Enforce the correct direction
+        //  on a best-effort basis (but it shouldn't matter for antialiasing).
+        const float2 duv_dx = ddx(video_uv);
+        const float2 duv_dy = ddy(video_uv);
+        #ifdef LAST_PASS
+            //  Same matrix as the non-last-pass case, with the second row
+            //  (the v derivatives) negated:
+            pixel_to_tangent_video_uv = float2x2(
+                duv_dx.x, duv_dy.x,
+                -duv_dx.y, -duv_dy.y);
+        #else
+            pixel_to_tangent_video_uv = float2x2(
+                duv_dx.x, duv_dy.x,
+                duv_dx.y, duv_dy.y);
+        #endif
+    #else
+        //  Manually define a transformation matrix.  We'll assume pixel-space
+        //  +y = down, just like +v = down.
+        if(geom_force_correct_tangent_matrix)
+        {
+            //  Get the surface normal based on the local intersection position.
+            //  Sphere modes (geom_mode < 2.5) use the position itself; the
+            //  cylinder drops the y component first:
+            const float3 normal_base = geom_mode < 2.5 ? pos :
+                float3(pos.x, 0.0, pos.z);
+            const float3 normal = normalize(normal_base);
+            //  Get pixel-to-object and object-to-tangent matrices and combine
+            //  them into a 2x2 pixel-to-tangent matrix for video_uv offsets:
+            const float3x3 pixel_to_object = get_pixel_to_object_matrix(
+                global_to_local, eye_pos_local, view_vec_global, pos, normal,
+                output_size_inv);
+            const float3x3 object_to_tangent = get_object_to_tangent_matrix(
+                pos, normal, geom_aspect, geom_mode);
+            const float3x3 pixel_to_tangent3x3 =
+                mul(object_to_tangent, pixel_to_object);
+            //  Keep only the upper-left 2x2 entries of the 3x3 product.
+            //  NOTE(review): the FIXME on the next line is unresolved; whether
+            //  [i][j] indexing here matches the intended row/column layout
+            //  depends on how float3x3/mul are defined elsewhere.  Confirm.
+            pixel_to_tangent_video_uv = float2x2(
+                pixel_to_tangent3x3[0][0], pixel_to_tangent3x3[0][1], pixel_to_tangent3x3[1][0], pixel_to_tangent3x3[1][1]);//._m00_m01_m10_m11); //TODO/FIXME: needs to correct for column-major??
+        }
+        else
+        {
+            //  Ignore curvature, and just consider flat scaling.  The
+            //  difference is only apparent with strong curvature:
+            pixel_to_tangent_video_uv = float2x2(
+                output_size_inv.x, 0.0, 0.0, output_size_inv.y);
+        }
+    #endif
+    return video_uv;
+}
+
+float get_border_dim_factor(const float2 video_uv, const float2 geom_aspect)
+{
+    //  COPYRIGHT NOTE FOR THIS FUNCTION:
+    //  Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey
+    //  This function uses an algorithm first coded in several of cgwg's GPL-
+    //  licensed lines in crt-geom-curved.cg and its ancestors.  The line
+    //  between algorithm and code is nearly indistinguishable here, so it's
+    //  unclear whether I could even release this project under a non-GPL
+    //  license with this function included.
+
+    //  Returns a brightness multiplier, capped at 1.0, that falls off as
+    //  video_uv approaches the uv-space image borders.  border_size,
+    //  border_darkness, and border_compress are globals.
+    //  Aspect-scaled distance to the closest edge along each axis:
+    const float2 dist_to_nearest_edge =
+        min(video_uv, float2(1.0) - video_uv) * geom_aspect;
+    //  How far into the border band we are (zero outside the band):
+    const float2 penetration =
+        max(float2(border_size) - dist_to_nearest_edge, float2(0.0));
+    //  1.0 when fully clear of the border, 0.0 at full penetration:
+    const float escape_ratio =
+        max(1.0 - length(penetration)/border_size, 0.0);
+    const float dim_factor =
+        pow(escape_ratio, border_darkness) * max(1.0, border_compress);
+    return min(dim_factor, 1.0);
+}
+
+
+
+#endif  //  GEOMETRY_FUNCTIONS_H
+
+/////////////////////////  END GEOMETRY-FUNCTIONS  /////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+float2x2 mul_scale(float2 scale, float2x2 matrix)
+{
+    //  Scale the matrix entries without building a scale matrix and calling
+    //  mul(): entries indexed [0][*] are multiplied by scale.x and entries
+    //  indexed [1][*] by scale.y, then repacked in the same order.
+    return float2x2(
+        matrix[0][0] * scale.x, matrix[0][1] * scale.x,
+        matrix[1][0] * scale.y, matrix[1][1] * scale.y);
+}
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+//  Vertex shader entry point: forwards the vertex position and texture
+//  coords, and precomputes per-draw values (inverse sizes, aspect/overscan
+//  vectors, the global-to-local rotation, and the local eye position) into
+//  output variables declared elsewhere in this file.
+void main() {
+   gl_Position = position;
+   //  NOTE(review): the reason for the 1.0001 scale is not stated here;
+   //  confirm before changing it.
+   vTexCoord = texCoord * 1.0001;
+	tex_uv = vTexCoord.xy;
+    //  Reciprocals of (video_size, texture_size) and of output_size:
+    video_and_texture_size_inv =
+        float4(1.0, 1.0, 1.0, 1.0) / float4(video_size, texture_size);
+    output_size_inv = float2(1.0, 1.0)/output_size;
+
+    //  Get aspect/overscan vectors from scalar parameters (likely uniforms):
+    const float viewport_aspect_ratio = output_size.x/output_size.y;
+    const float2 geom_aspect = get_aspect_vector(viewport_aspect_ratio);
+    const float2 geom_overscan = get_geom_overscan_vector();
+    //  Pack both vectors into one float4 output (xy = aspect, zw = overscan):
+    geom_aspect_and_overscan = float4(geom_aspect, geom_overscan);
+
+    #ifdef RUNTIME_GEOMETRY_TILT
+        //  Create a local-to-global rotation matrix for the CRT's coordinate
+        //  frame and its global-to-local inverse.  Rotate around the x axis
+        //  first (pitch) and then the y axis (yaw) with yucky Euler angles.
+        //  Positive angles go clockwise around the right-vec and up-vec.
+        //  Runtime shader parameters prevent us from computing these globally,
+        //  but we can still combine the pitch/yaw matrices by hand to cut a
+        //  few instructions.  Note that cg matrices fill row1 first, then row2,
+        //  etc. (row-major order).
+        const float2 geom_tilt_angle = get_geom_tilt_angle_vector();
+        const float2 sin_tilt = sin(geom_tilt_angle);
+        const float2 cos_tilt = cos(geom_tilt_angle);
+        //  Conceptual breakdown.  In this file the separate pitch/yaw
+        //  matrices below are the live code; the hand-combined
+        //  local_to_global further down is inside a block comment.
+        //  NOTE(review): these are declared "static const" but initialised
+        //  from runtime sin/cos values; presumably static/const are handled
+        //  by compatibility macros defined earlier in this file.  Confirm.
+              static const float3x3 rot_x_matrix = float3x3(
+                  1.0, 0.0, 0.0,
+                  0.0, cos_tilt.y, -sin_tilt.y,
+                  0.0, sin_tilt.y, cos_tilt.y);
+              static const float3x3 rot_y_matrix = float3x3(
+                  cos_tilt.x, 0.0, sin_tilt.x,
+                  0.0, 1.0, 0.0,
+                  -sin_tilt.x, 0.0, cos_tilt.x);
+              static const float3x3 local_to_global =
+                  mul(rot_y_matrix, rot_x_matrix);
+/*              static const float3x3 global_to_local =
+                  transpose(local_to_global);
+        const float3x3 local_to_global = float3x3(
+            cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x,
+            0.0, cos_tilt.y, sin_tilt.y,
+            sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x);
+*/        //  This is a pure rotation, so transpose = inverse:
+        const float3x3 global_to_local = transpose(local_to_global);
+        //  Decompose the matrix into 3 float3's for output:
+        global_to_local_row0 = float3(global_to_local[0][0], global_to_local[0][1], global_to_local[0][2]);//._m00_m01_m02);
+        global_to_local_row1 = float3(global_to_local[1][0], global_to_local[1][1], global_to_local[1][2]);//._m10_m11_m12);
+        global_to_local_row2 = float3(global_to_local[2][0], global_to_local[2][1], global_to_local[2][2]);//._m20_m21_m22);
+    #else
+        //  Without runtime tilt, use the precomputed static matrices.  The
+        //  global_to_local_row* outputs are not written in this branch.
+        static const float3x3 global_to_local = geom_global_to_local_static;
+        static const float3x3 local_to_global = geom_local_to_global_static;
+    #endif
+
+    //  Get an optimal eye position based on geom_view_dist, viewport_aspect,
+    //  and CRT radius/rotation:
+    #ifdef RUNTIME_GEOMETRY_MODE
+        const float geom_mode = geom_mode_runtime;
+    #else
+        static const float geom_mode = geom_mode_static;
+    #endif
+    const float3 eye_pos_global =
+        get_ideal_global_eye_pos(local_to_global, geom_aspect, geom_mode);
+    //  Output the eye position in the CRT's local frame:
+    eye_pos_local = mul(global_to_local, eye_pos_global);
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/manifest.bml b/shaders/CRT-Royale.shader/manifest.bml
new file mode 100644
index 00000000..778c0689
--- /dev/null
+++ b/shaders/CRT-Royale.shader/manifest.bml
@@ -0,0 +1,214 @@
+input
+  filter: nearest
+  
+// IMPORTANT:
+// Shader passes need to know details about the image in the mask_texture LUT
+// files, so set the following constants in user-preset-constants.h accordingly:
+// 1.) mask_triads_per_tile = (number of horizontal triads in mask texture LUT's)
+// 2.) mask_texture_small_size = (texture size of mask*texture_small LUT's)
+// 3.) mask_texture_large_size = (texture size of mask*texture_large LUT's)
+// 4.) mask_grille_avg_color = (avg. brightness of mask_grille_texture* LUT's, in [0, 1])
+// 5.) mask_slot_avg_color = (avg. brightness of mask_slot_texture* LUT's, in [0, 1])
+// 6.) mask_shadow_avg_color = (avg. brightness of mask_shadow_texture* LUT's, in [0, 1])
+// Shader passes also need to know certain scales set in this preset, but their
+// compilation model doesn't currently allow the preset file to tell them.  Make
+// sure to set the following constants in user-preset-constants.h accordingly too:
+// 1.) bloom_approx_scale_x = scale_x2
+// 2.) mask_resize_viewport_scale = vec2(scale_x6, scale_y5)
+// Finally, shader passes need to know the value of geom_max_aspect_ratio used to
+// calculate scale_y5 (among other values):
+// 1.) geom_max_aspect_ratio = (geom_max_aspect_ratio used to calculate scale_y5)
+
+// Pass0: Linearize the input based on CRT gamma and bob interlaced fields.
+// (Bobbing ensures we can immediately blur without getting artifacts.)
+program
+  filter: nearest
+  vertex: first-pass-linearize-crt-gamma-bob-fields.vs
+  fragment: first-pass-linearize-crt-gamma-bob-fields.fs
+  format: rgba16f
+  height: 100%
+  width: 100%
+
+// Pass1: Resample interlaced (and misconverged) scanlines vertically.
+// Separating vertical/horizontal scanline sampling is faster: It lets us
+// consider more scanlines while calculating weights for fewer pixels, and
+// it reduces our samples from vertical*horizontal to vertical+horizontal.
+// This has to come right after ORIG_LINEARIZED, because there's no
+// "original_source" scale_type we can use later.  
+program
+  filter: linear
+  vertex: scanlines-vertical-interlacing.vs
+  fragment: scanlines-vertical-interlacing.fs
+  height: 400%
+  width: 100%
+  format: rgba16f
+  
+// Pass2: Do a small resize blur of ORIG_LINEARIZED at an absolute size, and
+// account for convergence offsets.  We want to blur a predictable portion of the
+// screen to match the phosphor bloom, and absolute scale works best for
+// reliable results with a fixed-size bloom.  Picking a scale is tricky:
+// a.) 400x300 is a good compromise for the "fake-bloom" version: It's low enough
+//     to blur high-res/interlaced sources but high enough that resampling
+//     doesn't smear low-res sources too much.
+// b.) 320x240 works well for the "real bloom" version: It's 1-1.5% faster, and
+//     the only noticeable visual difference is a larger halation spread (which
+//     may be a good thing for people who like to crank it up).
+// Note the 4:3 aspect ratio assumes the input has cropped geom_overscan (so it's
+// *intended* for an ~4:3 aspect ratio).  
+program
+  filter: linear
+  vertex: bloom-approx.vs
+  fragment: bloom-approx.fs
+  format: rgba16f
+  width: 320 px
+  height: 240 px
+
+// Pass3: Vertically blur the input for halation and refractive diffusion.
+// Base this on BLOOM_APPROX: This blur should be small and fast, and blurring
+// a constant portion of the screen is probably physically correct if the
+// viewport resolution is proportional to the simulated CRT size.  
+program
+  filter: linear
+  vertex: blur9fast-vertical.vs
+  fragment: blur9fast-vertical.fs
+  format: rgba16f
+  height: 100%
+  width: 100%
+
+// Pass4: Horizontally blur the input for halation and refractive diffusion.
+// Note: Using a one-pass 9x9 blur is about 1% slower.  
+program
+  filter: linear
+  vertex: blur9fast-horizontal.vs
+  fragment: blur9fast-horizontal.fs
+  format: rgba16f
+  height: 100%
+  width: 100%
+
+// Pass5: Lanczos-resize the phosphor mask vertically.  Set the absolute
+// scale_x5 == mask_texture_small_size.x (see IMPORTANT above).  Larger scales
+// will blur, and smaller scales could get nasty.  The vertical size must be
+// based on the viewport size and calculated carefully to avoid artifacts later.
+// First calculate the minimum number of mask tiles we need to draw.
+// Since curvature is computed after the scanline masking pass:
+//   num_resized_mask_tiles = 2.0;
+// If curvature were computed in the scanline masking pass (it's not):
+//   max_mask_texel_border = ~3.0 * (1/3.0 + 4.0*sqrt(2.0) + 0.5 + 1.0);
+//   max_mask_tile_border = max_mask_texel_border/
+//       (min_resized_phosphor_triad_size * mask_triads_per_tile);
+//   num_resized_mask_tiles = max(2.0, 1.0 + max_mask_tile_border * 2.0);
+//   At typical values (triad_size >= 2.0, mask_triads_per_tile == 8):
+//       num_resized_mask_tiles = ~3.8
+// Triad sizes are given in horizontal terms, so we need geom_max_aspect_ratio
+// to relate them to vertical resolution.  The widest we expect is:
+//   geom_max_aspect_ratio = 4.0/3.0  // Note: Shader passes need to know this!
+// The fewer triads we tile across the screen, the larger each triad will be as a
+// fraction of the viewport size, and the larger scale_y5 must be to draw a full
+// num_resized_mask_tiles.  Therefore, we must decide the smallest number of
+// triads we'll guarantee can be displayed on screen.  We'll set this according
+// to 3-pixel triads at 768p resolution (the lowest anyone's likely to use):
+//   min_allowed_viewport_triads = 768.0*geom_max_aspect_ratio / 3.0 = 341.333333
+// Now calculate the viewport scale that ensures we can draw num_resized_mask_tiles:
+//   min_scale_x = num_resized_mask_tiles * mask_triads_per_tile /
+//       min_allowed_viewport_triads
+//   scale_y5 = geom_max_aspect_ratio * min_scale_x
+//   # Some code might depend on equal scales:
+//   scale_x6 = scale_y5
+// Given our default geom_max_aspect_ratio and min_allowed_viewport_triads:
+//   scale_y5 = 4.0/3.0 * 2.0/(341.33333 / 8.0) = 0.0625
+// IMPORTANT: The scales MUST be calculated in this way.  If you wish to change
+// geom_max_aspect_ratio, update that constant in user-preset-constants.h!
+program
+  filter: linear
+  format: rgba16f
+  width: 64 px
+  height: 6.25%
+  vertex: mask-resize-vertical.vs
+  fragment: mask-resize-vertical.fs
+  pixmap: textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearShadowMaskEDPResizeTo64.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearShadowMaskEDP.png
+    filter: linear
+    wrap: repeat
+
+// Pass6: Lanczos-resize the phosphor mask horizontally.  scale_x6 = scale_y5.
+// TODO: Check again if the shaders actually require equal scales.
+program
+  filter: nearest
+  vertex: mask-resize-horizontal.vs
+  fragment: mask-resize-horizontal.fs
+  format: rgba16f
+
+// Pass7: Resample (misconverged) scanlines horizontally, apply halation, and
+// apply the phosphor mask.
+program
+  filter: linear
+  format: rgba16f
+  height: 100%
+  width: 100%
+  vertex: scanlines-horizontal-apply-mask.vs
+  fragment: scanlines-horizontal-apply-mask.fs  
+  pixmap: textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearShadowMaskEDPResizeTo64.png
+    filter: linear
+    wrap: repeat
+  pixmap: textures/TileableLinearShadowMaskEDP.png
+    filter: linear
+    wrap: repeat
+
+// Pass 8: Compute a brightpass.  This will require reading the final mask.    
+program
+  filter: linear
+  format: rgba16f
+  vertex: brightpass.vs
+  fragment: brightpass.fs
+
+// Pass 9: Blur the brightpass vertically
+program
+  filter: linear
+  format: rgba16f
+  vertex: bloom-vertical.vs
+  fragment: bloom-vertical.fs
+  
+// Pass 10: Blur the brightpass horizontally and combine it with the dimpass:
+program
+  filter: linear
+  format: rgba16f
+  height: 100%
+  width: 100%
+  vertex: bloom-horizontal-reconstitute.vs
+  fragment: bloom-horizontal-reconstitute.fs
+  
+// Pass 11: Compute curvature/AA:
+program
+  filter: linear
+  format: rgba16f
+  vertex: geometry-aa-last-pass.vs
+  fragment: geometry-aa-last-pass.fs
+  
+output
+  filter: nearest
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/mask-resize-horizontal.fs b/shaders/CRT-Royale.shader/mask-resize-horizontal.fs
new file mode 100644
index 00000000..8545d587
--- /dev/null
+++ b/shaders/CRT-Royale.shader/mask-resize-horizontal.fs
@@ -0,0 +1,3208 @@
+#version 150
+
+uniform sampler2D source[];
+uniform vec4 sourceSize[];
+uniform vec4 targetSize;
+
+in Vertex {
+   vec2 vTexCoord;
+   vec2 src_tex_uv_wrap;
+   vec2 tile_uv_wrap;
+   vec2 resize_magnification_scale;
+   vec2 src_dxdy;
+   vec2 tile_size_uv;
+   vec2 input_tiles_per_texture;
+};
+
+out vec4 FragColor;
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// Compatibility macros: map the HLSL/Cg spellings used by the ported code below onto GLSL equivalents.
+#define mul(a,b) (b*a)  // operands are emitted in swapped order; arguments are not parenthesized
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)  // clamp to [0, 1]
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy  // same expansion as video_size
+#define video_size sourceSize[0].xy  // same expansion as texture_size
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  // expands to nothing: the keyword is stripped from the ported declarations
+#define inline  // expands to nothing: the keyword is stripped from the ported declarations
+#define const  // expands to nothing: the keyword is stripped from the ported declarations
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  // NOTE(review): expansion swaps the two arguments; confirm call sites expect this
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]  // the pass input is the first entry of the source[] sampler array
+
+#if defined(GL_ES)  // precision qualifier is only emitted when GL_ES is defined
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130  // this shader declares #version 150, so this branch is the one taken
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (raise toward 1.0 if the result looks unnecessarily dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif  //  DRIVERS_ALLOW_DERIVATIVES
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE: an earlier note said this was raised to 1.0 because 0.5 looked too dark, but the value set here is 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  2: true 4x4 Gaussian resize
+#else   //  RADEON_FIX is defined: use the medium-speed filter instead
+    static const float bloom_approx_filter_static = 1.0;    //  1: 3x3 resize blur
+#endif  //  RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+//  (Presumably the "_for_fake" value belongs to the version that skips the
+//  viewport-scale bloom, since it is the larger of the two -- confirm.)
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+//  (0.0625 is 1/16 of the viewport in each dimension.)
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+//  Pick the aperture grille average that matches the stripe width selected by
+//  PHOSPHOR_MASK_GRILLE14.  Its #define is commented out above, so the
+//  15-pixel value is used unless the macro is #defined somewhere else.
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+//  Note that the result is a magnitude: abs() discards the sign of c before
+//  max() clamps it, so FIX_ZERO(c) is never smaller than 2^-16.
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+//  (The #ifndef guard leaves an already existing definition untouched.)
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+//  All three are requested here; the DERIVED SETTINGS section below #undef's
+//  the ones that are unavailable or outranked, so at most one stays active.
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+//  (Disabled by default: the #define below is commented out.)
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Integrated GPUs such as the Intel HD 4000 currently can't cope with manual
+//  mask resizing, a runtime-selected geometry mode, or the true 4x4 Gaussian
+//  resize, so strip those options out as early as possible.
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may come from user-settings.h or
+//  from a wrapper .cg that #includes the current .cg pass.
+#ifndef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    //  Full-featured path: take the user's filter choice unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#else
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    //  Filter 2 (4x4 Gaussian resize) is unusable here and filter 1 (3x3 blur)
+    //  is usually the worse fallback, so map anything above 1.5 to filter 0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+//  Every runtime option is dropped together: without
+//  RUNTIME_SHADER_PARAMS_ENABLE none of the macros below survives this block.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    //  (RUNTIME_GEOMETRY_MODE is also removed by the
+    //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE block above.)
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  With the FORCE_ macro gone, RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT is
+    //  later defined only if DRIVERS_ALLOW_DYNAMIC_BRANCHES is.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+//  (The bias variant is requested whenever the lod variant is; the driver
+//  checks and the prioritization below then #undef whichever one is
+//  unavailable or redundant.)
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    //  NOTE(review): ANTIALIAS_DISABLE_ANISOTROPIC is not defined in this
+    //  section; presumably it also depends on tex2Dlod -- confirm.
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    //  tex2Dlod is available: drop every other tiling strategy.
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  tex2Dbias is the fallback: drop the two remaining strategies.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        //  It therefore outranks the discontinuity fix when both are defined.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+//  (After the prioritization above, at most one of the two is still defined.)
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  Prefer the large mipmapped LUT whenever it can be sampled without
+//  mipmapping artifacts, since it leaves more room for using fewer samples.
+//  That requires both tex2Dlod support and the tex2Dlod resampling strategy;
+//  every other configuration falls back to the small LUT.
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#else
+    #ifndef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #else
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #endif
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    //  No dynamic branches: only select at runtime if the user forces it.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#else
+    //  Dynamic branches are available, so runtime selection is always enabled.
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+    //  The red subpixel offset may be negative (blue uses its negation), so
+    //  take its magnitude: the border has to cover the farthest sample in
+    //  either direction, and a signed value would shrink the border instead.
+    static const float max_subpixel_offset =
+        abs(aa_subpixel_r_offset_static.x);
+    //  Most antialiasing filters have a base radius of 4.0 pixels:
+    static const float max_aa_base_pixel_border = 4.0 +
+        max_subpixel_offset;
+#else
+    //  GEOMETRY_EARLY is not defined: no antialiasing border is added.
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering widens the pixel border by roughly 0.5, unless one of
+//  the tex2Dlod/tex2Dbias strategies is active:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#endif
+//  The discontinuity fix needs 1.0 more pixel of border:
+#ifndef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#endif
+//  Round the pixel border up to a whole number of texels.  Same-pass curvature
+//  is assumed to roughly triple the texel frequency:
+#ifndef GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#else
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#endif
+//  Express the texel border in tiles, assuming the smallest allowed triads
+//  (the worst case):
+static const float max_mask_tile_border =
+    max_mask_texel_border / (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Decide how many resized tiles the MASK_RESIZE pass renders and at which
+//  texel (just inside the border) sampling of the result begins.
+#ifdef GEOMETRY_EARLY
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#else
+    #ifndef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  General case: one tile plus a border on each side.
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #else
+        //  "Tile flat twice": two borderless tiles.  Anisotropic filtering
+        //  doesn't appear to cause trouble in this configuration.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #endif
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+//  First, the number of triads rendered into the resize FBO:
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+//  Then divide by the FBO's viewport-relative scale, i.e. scale the FBO's
+//  triad count up to the whole viewport in each dimension:
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+//  Pi, truncated to 12 decimal places:
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+//  (The value below sits 0.0005 under one half.)
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+//  This option follows DRIVERS_ALLOW_DERIVATIVES automatically; comment out the
+//  inner #define to allow derivatives without relying on fine derivatives.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+//  (When this is disabled, the derived settings #undef every other RUNTIME_*
+//  option in this group as well as FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT.)
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+//  (Removed again by INTEGRATED_GRAPHICS_COMPATIBILITY_MODE.)
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+//  (Removed again by INTEGRATED_GRAPHICS_COMPATIBILITY_MODE.)
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    //  The default of 0.5 is what is set here.  An earlier note on this line
+    //  called 0.5 unnecessarily dark and mentioned 1.0 as the alternative, but
+    //  the value itself was left at 0.5.
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    //  With RADEON_FIX #defined the 3x3 resize blur (1) is selected; otherwise
+    //  the true 4x4 Gaussian resize (2) is.  In integrated graphics
+    //  compatibility mode the derived settings replace 2.0 with 0.0.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    //  false selects TFF, true selects BFF.
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    //  NOTE(review): the default x component below is negative, i.e. outside
+    //  the stated [0, 0.5] range; presumably the range refers to the offset's
+    //  magnitude -- confirm.  The trailing float2(0.0) is an alternative value
+    //  left in as a comment.
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (raise toward 1.0 if the result looks unnecessarily dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default SAR * PAR = (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. re-add horizontal padding, as noted
+    //  above.  Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);  //  e.g. try * 1.005 * float2(1.0, 240.0/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  textures named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture sizes (default 64x64 small, 512x512
+//  large), and to manually resize, also the horizontal triads per tile (8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero (drops the sign of c); the macro overloads by type:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace any value > 1.5 (mode 2) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used, i.e. whenever
+//  RUNTIME_SHADER_PARAMS_ENABLE is not #defined.  Some of these matter even then.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies, leaving at most one tiling strategy #defined:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  Pick the source LUT for the mask resize passes.  If we can use the large
+//  mipmapped LUT without mipmapping artifacts, we should: It gives us more
+//  options for using fewer samples.  That requires both driver support for
+//  tex2Dlod and the tex2Dlod resampling workaround; in every other case, use
+//  the small LUT.
+#if defined(DRIVERS_ALLOW_TEX2DLOD) && defined(ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD)
+    //  Large LUT.  TODO: Take advantage of this!
+    #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+#else
+    //  Small LUT (PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT is not #defined here).
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+//  Select the mask mode/type at runtime if dynamic branches are allowed, or
+//  if the user forces it anyway (accepting the slower flat-branch path):
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#elif defined(FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT)
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it:
+//  a.) "Tile flat twice" (only without GEOMETRY_EARLY): Render two tiles
+//      without borders, and start sampling at texel 0.  Anisotropic
+//      filtering doesn't seem to be a problem here.
+//  b.) Otherwise: Render one tile plus the worst-case tile border on both
+//      sides, and start sampling just past the leading texel border.
+#if !defined(GEOMETRY_EARLY) && defined(ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE)
+    //  Case a:
+    static const float mask_resize_num_tiles = 1.0 + 1.0;
+    static const float mask_start_texels = 0.0;
+#else
+    //  Case b:
+    static const float mask_resize_num_tiles = 1.0 +
+        2.0 * max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale, which limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension: triads in the FBO divided by the FBO's share of the viewport.
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA           //  presumably read by gamma-management.h -- confirm
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS       //  presumably read by tex2Dantialias.h -- confirm
+#define ANTIALIAS_OVERRIDE_PARAMETERS   //  presumably read by tex2Dantialias.h -- confirm
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Requires:   geom_max_aspect_ratio must be defined as a global constant.
+    //  Returns:    A unit-length aspect vector for (geom_aspect_ratio, 1.0).
+    //              The ratio is capped at geom_max_aspect_ratio, and
+    //              normalizing prevents the absolute scale from affecting the
+    //              uv-mapping for curvature.
+    const float capped_ratio = min(geom_aspect_ratio, geom_max_aspect_ratio);
+    return normalize(float2(capped_ratio, 1.0));
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Packs the scalar geom_overscan_x/y values into one float2 (x, y).
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Packs the scalar geom_tilt_angle_x/y values into one float2 (x, y).
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  Packs the x convergence offsets of the r, g, and b channels (in order).
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  Packs the y convergence offsets of the r, g, and b channels (in order).
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Packs the r channel's (x, y) convergence offsets into one float2.
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Packs the g channel's (x, y) convergence offsets into one float2.
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Packs the b channel's (x, y) convergence offsets into one float2.
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    //  Returns:    The aa_subpixel_r_offset value to use.  The runtime x/y
+    //              scalars are only used when both runtime antialiasing
+    //              weights and runtime subpixel offsets are enabled;
+    //              otherwise, the static setting is returned.
+    #if defined(RUNTIME_ANTIALIAS_WEIGHTS) && defined(RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS)
+        //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+        return float2(aa_subpixel_r_offset_x_runtime,
+            aa_subpixel_r_offset_y_runtime);
+    #else
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Returns:    The reciprocal of the average color of the mask selected
+    //              by mask_type (< 0.5: grille, < 1.5: slot, else: shadow).
+    //              The *_avg_color constants are globals defined elsewhere.
+    if(mask_type < 0.5) return 1.0/mask_grille_avg_color;
+    if(mask_type < 1.5) return 1.0/mask_slot_avg_color;
+    return 1.0/mask_shadow_avg_color;
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Returns:    The runtime-desired mask sample mode if runtime selection is
+    //              enabled, or the static one otherwise.  Without manual mask
+    //              resizing, the result is clamped to [1, 2].
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        const float requested_mode = mask_sample_mode_desired;
+    #else
+        const float requested_mode = mask_sample_mode_static;
+    #endif
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what's allowed:
+//  Dynamic loops not allowed: Use a flat static loop.
+//  Dynamic loops accommodated: Coarsely branch around static loops.
+//  Dynamic loops assumed allowed: Use a flat dynamic loop.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS   //  (sic: macro name spelling)
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  For now, the expected min tile size simply equals the allowed min tile size:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size = 
+        mask_min_allowed_tile_size;
+//  Limit the number of sinc resize taps by the maximum minification factor:
+static const float pi_over_lobes = pi/mask_sinc_lobes;  //  for the Lanczos window
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   The following global constants must be defined:
+    //              1.) mask_sinc_lobes
+    //              2.) max_sinc_resize_samples_m4
+    //  Returns:    The minimum number of texture samples for a correct downsize
+    //              at magnification_scale, rounded up to a multiple of 4 and
+    //              capped at the static maximum.
+    //  We're downsizing, so the filter is sized across 2*lobes output pixels
+    //  (not 2*lobes input texels): The smaller magnification_scale is, the
+    //  more input samples we need.
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float sample_cap_m4 = max_sinc_resize_samples_m4;
+    #else   //  ifdef BREAK_LOOPS_INTO_PIECES
+        //  Simulating loops with branches imposes a 128-sample limit.
+        const float sample_cap_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    const float needed_samples = 2.0 * mask_sinc_lobes / magnification_scale;
+    return min(ceil(needed_samples * 0.25) * 4.0, sample_cap_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
+    const float2 tex_size, const float dr,
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) input_tiles_per_texture_r is the number of input tiles
+    //                  that can fit in the input texture in the direction we're
+    //                  resampling this pass.
+    //              3.) samples is the sample count; the first sample sits
+    //                  (samples/2 - 1) texels before the previous texel.
+    //              4.) vertical indicates whether we're resampling vertically
+    //                  this pass (or horizontally).
+    //  Returns:    float2(first sample's tile_uv coord, its texel distance
+    //              from the destination pixel), in the resized dimension only.
+    //  NOTE(review): The "+ 1.0 if negative" step below assumes frac() keeps
+    //  the sign of its input.  If frac(x) == x - floor(x), negative coords land
+    //  in [1, 2) instead of [0, 1] -- confirm against the frac() definition.
+    //  Both dimensions are computed as if for a one-pass 2D resize, and the
+    //  unneeded (and incorrect) dimension is thrown away at the end.
+    const float2 dest_texel = tex_uv * tex_size;
+    const float2 texel_before_dest =
+        floor(dest_texel - float2(under_half)) + float2(0.5);
+    //  Start at the topmost or leftmost sample; we'll work down or right:
+    const float2 start_texel = texel_before_dest - float2(samples/2.0 - 1.0);
+    const float2 start_dist_2D = dest_texel - start_texel;
+    //  Convert from tex_uv to tile_uv coords so we can sub fracs for fmods:
+    const float2 start_tile_uv_wrap_2D =
+        (start_texel * dr) * input_tiles_per_texture_r;
+    //  Project wrapped coordinates to the [0, 1] range.  We'll do this with
+    //  all samples, but the first texel is special, since it might be negative:
+    const float2 is_negative = float2((start_tile_uv_wrap_2D.x < 0.),
+        (start_tile_uv_wrap_2D.y < 0.));
+    const float2 start_tile_uv_2D = frac(start_tile_uv_wrap_2D) + is_negative;
+    //  Pack the first texel's tile_uv coord and texel distance in 1D
+    //  (a lerp on float(vertical) would select the same pair):
+    if(vertical)
+    {
+        return float2(start_tile_uv_2D.y, start_dist_2D.y);
+    }
+    return float2(start_tile_uv_2D.x, start_dist_2D.x);
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    //  One [slow] workaround is to force mip level 0 (or a large negative bias):
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);    //  The LOD is the 3rd argument.
+    #else   //  NOTE(review): tex2Dbias must be defined elsewhere -- confirm.
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v).
+    //  Expects i_base, i, tile_dr, tile_size_uv_r, and first_texel_tile_uv_rrrr
+    //  (a float4 of the first texel's u or v tile_uv coord) in scope.  Defines
+    //  true_i, tile_uv_r (wrapped by frac), and tex_uv_r for four new samples.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+    //  Defines weights (sinc, or Lanczos-windowed sinc) from dist and pi_dist:
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else   //  Plain sinc, capped at 1.0 like the windowed version.
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif  //  PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+    //  Adds four weighted samples to pixel_color and the weights to weight_sum:
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+    //  Samples four texels along v (at a fixed tex_uv.x), then accumulates them:
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+    //  Samples four texels along u (at a fixed tex_uv.y), then accumulates them:
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the uv-space size of an input tile
+    //                  (its fraction of the input texture), in the direction
+    //                  we're resampling this pass.
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  The loop
+    //  body macros above read/write these locals by name (i_base, i, etc.).
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block (after 64+32+16+8+4) for 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Reduce the 4 per-lane weight sums to a scalar, normalize, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  The loop
+    //  body macros above read/write these locals by name (i_base, i, etc.).
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block (after 64+32+16+8+4) for 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Reduce the 4 per-lane weight sums to a scalar, normalize, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
+    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    //  wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Requires:   1.) Requirements of get_resized_mask_tile_size() must be
+    //                  met, particularly regarding global constants.
+    //              The function parameters must be defined as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    //  Returns:    Return a float4 containing:
+    //                  xy: tex_uv coords for the start of the mask tile
+    //                  zw: tex_uv size of the mask tile from start to end
+    //              mask_tiles_per_screen is an out parameter containing the
+    //              number of mask tiles that will fit on the screen.
+    //  First get the final resized tile size.  The viewport size and mask
+    //  resize viewport scale must be correct, but don't solemnly swear they
+    //  were correct in both mask resize passes unless you know it's true.
+    //  (We can better ensure a correct tile aspect ratio if the parameters are
+    //  guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    //  sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(mask_sample_mode < 0.5)
+    {
+        //  Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        //  size and starts mask_start_texels in, to allow for border texels:
+        const float2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
+        const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        //  mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        //  If we're tiling at the original size (1:1 pixel:texel), redefine a
+        //  "tile" to be the full texture containing many triads.  Otherwise,
+        //  we're hardware-resampling an LUT, and the texture truly contains a
+        //  single unresized phosphor mask tile anyway.
+        static const float2 mask_tile_uv_size = float2(1.0);
+        static const float2 mask_tile_start_uv = float2(0.0);
+        if(mask_sample_mode > 1.5)
+        {
+            //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else
+        {
+            //  Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+/*  NOTE(review): disabled, yet called below #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Requires:   1.) tile_uv_wrap contains tile-relative uv coords, where the
+    //                  tile spans from [0, 1], such that (0.5, 0.5) is at the
+    //                  tile center.  The input coords can range from [0, inf],
+    //                  and their fractional parts map to a repeated tile.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
+    //                  for the start of the embedded tile in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw contains the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    Return tex_uv coords (used for texture sampling)
+    //              corresponding to tile_uv_wrap.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        //  Manually repeat the resized mask tile to fill the screen by taking
+        //  fractional tile_uv coords.  That confuses anisotropic filtering, so
+        //  TILE_FLAT_TWICE wraps every two tiles instead (tile_uv in [0, 2)).
+        //  derived-settings-and-constants.h disables incompatible options.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+        #else
+            float2 tile_uv = frac(tile_uv_wrap);
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            const float2 tile_uv_dx = ddx(tile_uv);
+            const float2 tile_uv_dy = ddy(tile_uv);
+            tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
+                tile_uv_dx, tile_uv_dy);
+        #endif
+        //  The tile is embedded in a padded FBO, and it may start at a
+        //  nonzero offset if border texels are used to avoid artifacts:
+        const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+            tile_uv * mask_tile_start_uv_and_size.zw;
+        return mask_tex_uv;
+    }
+    else
+    {
+        //  Sample from the input phosphor mask texture with hardware tiling.
+        //  If we're tiling at the original size (mode 2), the "tile" is the
+        //  whole texture, and it contains a large number of triads mapped with
+        //  a 1:1 pixel:texel ratio.  OTHERWISE, the texture contains a single
+        //  unresized tile.  tile_uv_wrap already has correct coords for both!
+        return tile_uv_wrap;
+    }
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+void main() {
+    //  The input contains one mask tile horizontally and a number vertically.
+    //  Resize the tile horizontally to its final screen size and repeat it
+    //  until drawing at least mask_resize_num_tiles, leaving it unchanged
+    //  vertically.  Lanczos-resizing the phosphor mask achieves much sharper
+    //  results than mipmapping, and outputting >= mask_resize_num_tiles makes
+    //  for easier tiled sampling later.
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        //  Discard unneeded fragments in case our profile allows real branches.
+        //  (Other mask sample modes never sample the manually resized mask.)
+        if(get_mask_sample_mode() < 0.5 &&
+            max(tile_uv_wrap.x, tile_uv_wrap.y) <= mask_resize_num_tiles)
+        {
+            const float src_dx = src_dxdy.x;
+            const float2 src_tex_uv = frac(src_tex_uv_wrap);
+            const float3 pixel_color = downsample_horizontal_sinc_tiled(input_texture,
+                src_tex_uv, texture_size, src_dx,
+                resize_magnification_scale.x, tile_size_uv.x);
+            //  The input LUT was linear RGB, and so is our output:
+            FragColor = float4(pixel_color, 1.0);
+        }
+        else
+        {
+            discard;
+        }
+    #else
+        //  Manual mask resizing is compiled out; just drop every fragment.
+        discard;
+    #endif
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/mask-resize-horizontal.vs b/shaders/CRT-Royale.shader/mask-resize-horizontal.vs
new file mode 100644
index 00000000..b64cf9c8
--- /dev/null
+++ b/shaders/CRT-Royale.shader/mask-resize-horizontal.vs
@@ -0,0 +1,3236 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+in vec4 position;
+in vec2 texCoord;
+// Per-vertex outputs handed to the next shader stage:
+out Vertex {
+   vec2 vTexCoord;
+   vec2 src_tex_uv_wrap;
+   vec2 tile_uv_wrap;
+   vec2 resize_magnification_scale;
+   vec2 src_dxdy;
+   vec2 tile_size_uv;
+   vec2 input_tiles_per_texture;
+};
+// Uniforms, aliased below as texture_size/video_size, output_size, frame_count:
+uniform vec4 targetSize;
+uniform vec4 sourceSize[];
+uniform int phase;
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros converting HLSLisms into GLSLisms (static/inline/const are erased)
+#define mul(a,b) (b*a)
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)
+#define rsqrt(c) inversesqrt(c)
+// NOTE(review): atan2(a,b) expands to atan(b,a) -- operands swapped; confirm.
+#define input_texture source[0]
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default; raise it (up to 1.0) if the result looks unnecessarily dark
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; set to the default 0.5 (a stale note here claimed it had been raised to 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  option 2: true 4x4 Gaussian resize
+#else  //  RADEON_FIX is #defined
+    static const float bloom_approx_filter_static = 1.0;    //  option 1: 3x3 resize blur
+#endif  //  RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;    //  14-pixel stripes
+#else  //  default: 15-pixel stripes
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero: clamp abs(c) to at least 2^-16.  The sign of c is discarded, so the result is always positive.  Being a macro, it works for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  SIMULATE_CRT_ON_LCD (now always #defined past this point)
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0 (only values above 1.5 are remapped; 0.0 and 1.0 pass through):
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else  //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static setting as-is
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used: without
+//  RUNTIME_SHADER_PARAMS_ENABLE, every runtime-only option listed here is #undef'ed.  Most of these won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility: requesting a tex2Dlod strategy also requests its tex2Dbias counterpart.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies, one DRIVERS_ALLOW_* capability at a time:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies, keeping only the first still-#defined of: tex2Dlod, tex2Dbias, tile-flat-twice, fix-discontinuities.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else  //  neither tex2Dlod nor tex2Dbias remains
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them into one ..._TEX2DLOD_FAMILY flag:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way (tex2Dlod wins over tex2Dbias):
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.  Every other configuration reads the small LUT.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else  //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else  //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT    //  always enabled when dynamic branches are allowed
+#else  //  !DRIVERS_ALLOW_DYNAMIC_BRANCHES: enable only if the user forces it
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        //  Use the magnitude: blue mirrors red, so a negative x offset still needs border room.
+        static const float max_subpixel_offset = abs(aa_subpixel_r_offset_static.x);
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else  //  !GEOMETRY_EARLY: no same-pass antialiasing, so no base border is reserved
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border (skipped when a tex2Dlod/tex2Dbias tiling strategy is active):
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else  //  !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border (rounding up).  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else  //  !GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions, i.e. the smallest allowed tile (minimum triad size times triads per tile):
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.  Only the tile-flat-twice case differs; the other two branches are identical.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else  //  general case: one tile plus a border on each side
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else  //  GEOMETRY_EARLY: always the bordered layout
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5, which is the value used here (an earlier note said 1.0 was used because 0.5 looked too dark -- NOTE(review): confirm intended value)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//".
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  !PHOSPHOR_MASK_GRILLE14: use the 15-pixel grille average
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero (note: yields |c|, so the sign is dropped); using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so map any setting > 1.5 to 0.0 (bilinear):
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static setting as-is
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+//  Prioritize anisotropic resampling compatibility strategies the same way (tex2Dlod wins over tex2Dbias):
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Get an aspect ratio vector.  Enforce geom_max_aspect_ratio, and prevent
+    //  the absolute scale from affecting the uv-mapping for curvature:
+    const float geom_clamped_aspect_ratio =
+        min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const float2 geom_aspect =
+        normalize(float2(geom_clamped_aspect_ratio, 1.0));
+    return geom_aspect;
+}
+
+inline float2 get_geom_overscan_vector()
+{
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #else
+            return aa_subpixel_r_offset_static;
+        #endif
+    #else
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    static const float mask_grille_amplify = 1.0/mask_grille_avg_color;
+    static const float mask_slot_amplify = 1.0/mask_slot_avg_color;
+    static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color;
+    return mask_type < 0.5 ? mask_grille_amplify :
+        mask_type < 1.5 ? mask_slot_amplify :
+        mask_shadow_amplify;
+}
+
+inline float get_mask_sample_mode()
+{
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+            return mask_sample_mode_desired;
+        #else
+            return clamp(mask_sample_mode_desired, 1.0, 2.0);
+        #endif
+    #else
+        #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+            return mask_sample_mode_static;
+        #else
+            return clamp(mask_sample_mode_static, 1.0, 2.0);
+        #endif
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what's allowed:
+//  Dynamic loops not allowed: Use a flat static loop.
+//  Dynamic loops accommodated: Coarsely branch around static loops.
+//  Dynamic loops assumed allowed: Use a flat dynamic loop.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  See if we can get a static min tile size > mask_min_allowed_tile_size:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size = 
+        mask_min_allowed_tile_size;
+//  Limit the number of sinc resize taps by the maximum minification factor:
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   The following global constants must be defined:
+    //              1.) mask_sinc_lobes
+    //              2.) max_sinc_resize_samples_m4
+    //  Returns:    The minimum number of texture samples for a correct downsize
+    //              at magnification_scale.
+    //  We're downsizing, so the filter is sized across 2*lobes output pixels
+    //  (not 2*lobes input texels).  This impacts distance measurements and the
+    //  minimum number of input samples needed.
+    const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float max_samples_m4 = max_sinc_resize_samples_m4;
+    #else   // ifdef BREAK_LOOPS_INTO_PIECES
+        //  Simulating loops with branches imposes a 128-sample limit.
+        const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    return min(min_samples_m4, max_samples_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, 
+    const float2 tex_size, const float dr, 
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) input_tiles_per_texture_r is the number of input tiles
+    //                  that can fit in the input texture in the direction we're
+    //                  resampling this pass.
+    //              3.) vertical indicates whether we're resampling vertically
+    //                  this pass (or horizontally).
+    //  Returns:    Pack and return the first sample's tile_uv coord in [0, 1]
+    //              and its texel distance from the destination pixel, in the
+    //              resized dimension only.
+    //  We'll start with the topmost or leftmost sample and work down or right,
+    //  so get the first sample location and distance.  Modify both dimensions
+    //  as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    //  (and incorrect) dimension at the end.
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);
+    const float2 first_texel_uv_wrap_2D = first_texel * dr;
+    const float2 first_texel_dist_2D = curr_texel - first_texel;
+    //  Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const float2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    //  Project wrapped coordinates to the [0, 1] range.  We'll do this with all
+    //  samples, but the first texel is special, since it might be negative.
+    const float2 coord_negative =
+        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));
+    const float2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
+    //  Pack the first texel's tile_uv coord and texel distance in 1D:
+    const float2 tile_u_and_dist =
+        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const float2 tile_v_and_dist =
+        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    //  One [slow] workaround is to force mip level 0 (explicit LOD == 0.0):
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord in [0, 1].  tex_uv_r will contain the tile_uv u or v coord
+    //  for four new texel samples.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the uv-space size of an input tile
+    //                  within the input texture, in the direction we're
+    //                  resampling this pass.
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
+    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    //  wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Purpose:    Locate the phosphor mask tile inside the texture that will
+    //              be sampled and count how many tiles span the screen.
+    //  Requires:   1.) Every requirement of get_resized_mask_tile_size() holds,
+    //                  especially those concerning global constants.
+    //              The arguments must be supplied as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  when get_mask_sample_mode() is 0 (else any value)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  when get_mask_sample_mode() is 0 (else any value)
+    //              3.) true_viewport_size == output_size of a pass whose
+    //                  viewport scale is 1.0 (i.e. the genuine size)
+    //  Returns:    A float4 laid out as:
+    //                  xy: tex_uv coords where the mask tile begins
+    //                  zw: tex_uv extent of the mask tile, start to end
+    //              The out parameter mask_tiles_per_screen receives the number
+    //              of mask tiles that fit on the screen.
+    //  The resized tile size is needed first.  The viewport size and the mask
+    //  resize viewport scale must be right, but we pass false rather than
+    //  "solemnly swearing" the inputs matched in both mask resize passes: a
+    //  false promise gives inconsistent sizes across passes and broken coords.
+    //  (A correct tile aspect ratio is easier to ensure when the parameters
+    //  are guaranteed correct in every pass, which is what the flag is for.)
+    const float sample_mode = get_mask_sample_mode();
+    const float2 resized_tile = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(sample_mode < 0.5)
+    {
+        //  Sampling MASK_RESIZE: the resized tile covers only a fraction of
+        //  that texture and begins at a nonzero offset, leaving room for
+        //  border texels.
+        const float2 tile_uv_extent = resized_tile / mask_resize_texture_size;
+        const float2 tile_uv_origin =
+            (mask_start_texels / resized_tile) * tile_uv_extent;
+        //  The tile count has to use the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / resized_tile;
+        return float4(tile_uv_origin, tile_uv_extent);
+    }
+    //  Hardware-tiled modes.  When tiling at the original size (1:1
+    //  pixel:texel), a "tile" is redefined as the full texture holding many
+    //  triads.  Otherwise an LUT is hardware-resampled, and the texture truly
+    //  holds one unresized phosphor mask tile.  In both cases the tile starts
+    //  at uv (0, 0) and spans the whole (1, 1) texture; only the tile count
+    //  differs between the two modes.
+    if(sample_mode > 1.5)
+    {
+        //  Full LUT repeated at a 1:1 pixel:texel ratio, no resizing:
+        mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+    }
+    else
+    {
+        //  Original LUT resized by the hardware:
+        mask_tiles_per_screen = true_viewport_size / resized_tile;
+    }
+    //  xy = tile start (0, 0), zw = tile size (1, 1):
+    return float4(float2(0.0), float2(1.0));
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Purpose:    Map tile-relative coords onto coords usable for sampling
+    //              the phosphor mask texture.
+    //  Requires:   1.) tile_uv_wrap holds tile-relative uv coords in which one
+    //                  tile spans [0, 1], so (0.5, 0.5) is the tile center.
+    //                  Inputs may lie anywhere in [0, inf]; the fractional
+    //                  part selects the position within a repeated tile.
+    //                  (A "tile" may be a texture, the video embedded in a
+    //                  texture, or any other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy holds the tex_uv coords
+    //                  where the embedded tile starts in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw holds the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    tex_uv coords (for texture sampling) that correspond to
+    //              tile_uv_wrap.
+    const bool manual_tiling = get_mask_sample_mode() < 0.5;
+    if(!manual_tiling)
+    {
+        //  Hardware tiling of the input phosphor mask texture.  In mode 2
+        //  (tiling at the original size) the "tile" is the entire texture,
+        //  holding many triads at a 1:1 pixel:texel ratio; OTHERWISE the
+        //  texture holds a single unresized tile.  Either way tile_uv_wrap
+        //  is already the right coordinate, so pass it through untouched.
+        return tile_uv_wrap;
+    }
+    //  Manual tiling: repeat the resized mask tile across the screen by hand.
+    //  Start from the fractional tile coords.  frac/fmod on coords confuses
+    //  anisotropic filtering, so apply whichever workaround the user options
+    //  select (derived-settings-and-constants.h disables incompatible ones).
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        float2 wrapped_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+    #else
+        float2 wrapped_uv = frac(tile_uv_wrap);
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        //  NOTE(review): this helper is commented out just above in this
+        //  file, so this branch presumably cannot compile if ever enabled.
+        wrapped_uv = fix_tiling_discontinuities_normalized(wrapped_uv,
+            ddx(wrapped_uv), ddy(wrapped_uv));
+    #endif
+    //  The tile lives inside a padded FBO and may begin at a nonzero offset
+    //  when border texels are used to avoid artifacts, so scale the wrapped
+    //  coords by the tile size and shift them by the tile start:
+    return mask_tile_start_uv_and_size.xy +
+        wrapped_uv * mask_tile_start_uv_and_size.zw;
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+void main() {
+    gl_Position = position;
+    vTexCoord = texCoord * 1.0001;
+    float2 tex_uv = vTexCoord.xy;
+
+    //  Estimate the viewport size from this pass's output size.  If the
+    //  estimate is wrong and mask_specify_num_triads is 1.0/true, the user
+    //  ends up with the wrong number of triads.
+    const float2 viewport_size_guess = output_size / mask_resize_viewport_scale;
+    //  Final size of one resized phosphor mask tile.  The previous pass most
+    //  likely estimated the viewport and MASK_RESIZE output sizes differently,
+    //  so pass false instead of swearing the inputs were identical.
+    const float2 resized_tile_size = get_resized_mask_tile_size(
+        viewport_size_guess, output_size, false);
+
+    //  Texel size of an input tile, plus the values derived from it.  These
+    //  go to names declared outside main() (presumably the shader outputs):
+    const float2 src_tile_size = float2(
+        min(mask_resize_src_lut_size.x, video_size.x), resized_tile_size.y);
+    tile_size_uv = src_tile_size / texture_size;
+    input_tiles_per_texture = texture_size / src_tile_size;
+
+    //  Magnification scale of the resize and the one-texel horizontal step:
+    resize_magnification_scale = resized_tile_size / src_tile_size;
+    src_dxdy = float2(1.0/texture_size.x, 0.0);
+
+    //  Resized tiles are rendered until the output FBO is full or a limit is
+    //  reached, so the [wrapped] tile uv coords follow from the output uv
+    //  coords and the number of tiles that fit in the FBO this pass.
+    const float2 tiles_in_output = output_size / resized_tile_size;
+    const float2 video_uv = tex_uv * texture_size / video_size;
+    const float2 wrapped_tile_uv = video_uv * tiles_in_output;
+
+    //  Turn the [wrapped] tile uv coords into [wrapped] texture uv coords via
+    //  the tile size in uv units; frac() is left to the fragment shader.  The
+    //  wrapped tile uv itself stays a local and is not exported from here
+    //  (the port's commented-out assignment for it is not reinstated).
+    src_tex_uv_wrap = wrapped_tile_uv * tile_size_uv;
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/mask-resize-vertical.fs b/shaders/CRT-Royale.shader/mask-resize-vertical.fs
new file mode 100644
index 00000000..16e8090e
--- /dev/null
+++ b/shaders/CRT-Royale.shader/mask-resize-vertical.fs
@@ -0,0 +1,3248 @@
+#version 150
+
+uniform sampler2D source[];
+uniform vec4 sourceSize[];   // [0].xy is aliased below as both texture_size and video_size
+uniform vec4 targetSize;     // .xy is aliased below as output_size
+uniform sampler2D pixmap[];  // slots 0, 2 and 4 are aliased below as the small mask textures
+uniform int phase;           // aliased below as frame_count
+
+in Vertex {  // inputs to this fragment stage, matched by block name
+   vec2 vTexCoord;
+   vec2 src_tex_uv_wrap;
+   vec2 resize_magnification_scale;
+};
+
+out vec4 FragColor;  // the single color output of this pass
+
+// USER SETTINGS BLOCK // (tunable parameters, fixed here as preprocessor constants)
+
+#define crt_gamma 2.500000  // same value as crt_gamma_static in the user-settings section below
+#define lcd_gamma 2.200000  // same value as lcd_gamma_static in the user-settings section below
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000  // negative zero; compares equal to 0.0
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000  // negative zero; compares equal to 0.0
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000  // NOTE(review): presumably feeds get_mask_sample_mode(), where < 0.5 means "sample MASK_RESIZE" - confirm
+#define mask_specify_num_triads 0.000000  // used as a 0.0/1.0 (false/true) flag by the mask helpers
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0  // negative zero; compares equal to 0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK // (the compatibility macros follow)
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms
+#define mul(a,b) ((b)*(a))  // operands are swapped relative to the HLSL call; arguments are parenthesized so expression arguments keep their grouping
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy  // same expression as texture_size
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)  // NOTE(review): GLSL mod() floors while HLSL fmod() truncates, so results differ for negative x - confirm callers
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  // NOTE(review): swaps the two arguments, yet HLSL atan2(y, x) and GLSL atan(y, x) share one order - confirm call sites expect this
+#define rsqrt(c) inversesqrt(c)
+
+#define mask_grille_texture_small pixmap[0]  // only even pixmap slots (0, 2, 4) are aliased in this block
+#define mask_slot_texture_small pixmap[2]
+#define mask_shadow_texture_small pixmap[4]
+
+#if defined(GL_ES)  // GL_ES builds get an explicit mediump qualifier
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION  // expands to nothing outside GL_ES
+#endif
+
+#if __VERSION__ >= 130  // texture2D() is chosen only for GLSL versions below 130
+	#define COMPAT_TEXTURE texture  // the branch taken here, since this file declares #version 150
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; set to the 0.5 default here (an older note on this line said 1.0, which the value does not match)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try 1.0 if the image looks unnecessarily dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero by clamping |c| to >= 2^-16 (the sign is dropped); a macro works for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; currently set to the default, 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (raise toward 1.0 if the result looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]; clamped to [1, 2] without manual resize
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;    //  = 3.0 pixels per triad
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio]; get_aspect_vector() enforces the max;
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. re-add horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS (static values for the rounded, darkened screen border):
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;             //  presumably for the version with the bloom
+static const float bloom_approx_size_x_for_fake = 400.0;    //  presumably for the version that skips it
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//  (The filenames listed under each value are the masks it applies to.)
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  15-pixel stripes (the default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; yields |c|, so c's sign is discarded
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (any value > 1.5) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  No compatibility mode: Use the static setting unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  ifndef RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility (for now, both
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD       //  macros of a pair are #defined)
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  ifndef DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  ifndef DRIVERS_ALLOW_TEX2DLOD (any tex2Dbias backups remain #defined)
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ifndef DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD      //  tex2Dlod wins: Drop the rest.
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS     //  tex2Dbias is next in line.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD (at most one tiling strategy is left)
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD   //  (tex2Dlod beats tex2Dbias):
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  No tex2Dlod resampling strategy: Use the small LUT.
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else   //  tex2Dlod is unavailable: Use the small LUT.
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  No dynamic branches: Only select at runtime if the user forces it.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most AA filters have a 4.0 px base radius.  NOTE(review): .x is signed
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;    //  (default -1/3): should this be abs()?
+#else   //  No same-pass curvature: No antialiasing border is needed.
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  A tex2Dlod/tex2Dbias strategy is active, so no extra padding:
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions
+static const float max_mask_tile_border = max_mask_texel_border/    //  (i.e.
+    (mask_min_allowed_triad_size * mask_triads_per_tile);   //  the smallest tile):
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  General case: One tile plus a border on each side.
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else   //  GEOMETRY_EARLY: Same values as the general case above.
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =     //  Triads across the resize FBO
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =   //  FBO triads / FBO scale
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;     //  0.0005 short of 0.5
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA           //  presumably read by gamma-management.h
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS       //  presumably read by tex2Dantialias.h
+#define ANTIALIAS_OVERRIDE_PARAMETERS   //  presumably read by tex2Dantialias.h
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Requires:   geom_max_aspect_ratio must be defined globally.
+    //  Returns:    A unit-length (aspect, 1.0) vector, with the aspect ratio
+    //              capped at geom_max_aspect_ratio.  Normalizing prevents the
+    //              absolute scale from affecting the uv-mapping for curvature.
+    const float capped_ratio = min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const float2 unnormalized_aspect = float2(capped_ratio, 1.0);
+    return normalize(unnormalized_aspect);
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Returns: The scalar geom_overscan_x/y values packed as (x, y).
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Returns: The scalar geom_tilt_angle_x/y values packed as (x, y).
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  Returns: The x convergence offsets of all channels, packed as (r, g, b).
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  Returns: The y convergence offsets of all channels, packed as (r, g, b).
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Returns: The red channel's convergence offsets, packed as (x, y).
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Returns: The green channel's convergence offsets, packed as (x, y).
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Returns: The blue channel's convergence offsets, packed as (x, y).
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    //  Use the static offset unless BOTH runtime antialiasing options are on:
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        return aa_subpixel_r_offset_static;
+    #else
+        #ifndef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            return aa_subpixel_r_offset_static;
+        #else   //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #endif
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Returns: 1.0/(average mask color) for the mask selected by mask_type.
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    const float slot_or_shadow_gain = mask_type < 1.5 ? slot_gain : shadow_gain;
+    return mask_type < 0.5 ? grille_gain : slot_or_shadow_gain;
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Returns: The mask sample mode, read from the runtime value if runtime
+    //           selection is enabled or from the static setting otherwise.
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        const float requested_mode = mask_sample_mode_desired;
+    #else
+        const float requested_mode = mask_sample_mode_static;
+    #endif
+    //  Without the manual resize passes, restrict the mode to [1, 2]:
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what's allowed:
+//  Dynamic loops not allowed: Use a flat static loop.
+//  Dynamic loops accommodated: Coarsely branch around static loops.
+//  Dynamic loops assumed allowed: Use a flat dynamic loop.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  The expected min tile size is currently just the allowed min tile size:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size = 
+        mask_min_allowed_tile_size;
+//  Limit the number of sinc resize taps by the maximum minification factor
+static const float pi_over_lobes = pi/mask_sinc_lobes;  //  (LUT size / min tile):
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   These global constants must be defined:
+    //              1.) mask_sinc_lobes
+    //              2.) max_sinc_resize_samples_m4
+    //  Returns:    The smallest multiple-of-4 sample count for a correct
+    //              downsize at magnification_scale, capped at the static max.
+    //  When downsizing, the filter spans 2*lobes OUTPUT pixels (not input
+    //  texels), so more input samples are needed as the scale shrinks.
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float sample_cap_m4 = max_sinc_resize_samples_m4;
+    #else   //  e.g. ifdef BREAK_LOOPS_INTO_PIECES
+        //  Simulating loops with branches imposes a 128-sample limit.
+        const float sample_cap_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    const float filter_diameter = 2.0 * mask_sinc_lobes;
+    const float needed_samples = filter_diameter / magnification_scale;
+    const float needed_samples_m4 = ceil(needed_samples * 0.25) * 4.0;
+    return min(needed_samples_m4, sample_cap_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
+    const float2 tex_size, const float dr,
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Requires:   1.) dr is the uv size of one texel along the resampled
+    //                  axis: du == 1.0/texture_size.x horizontally or
+    //                  dv == 1.0/texture_size.y vertically.  (A scalar is
+    //                  used to save register space.)
+    //              2.) input_tiles_per_texture_r counts the input tiles that
+    //                  fit in the input texture along the resampled axis.
+    //              3.) vertical selects a vertical resample this pass (true)
+    //                  or a horizontal one (false).
+    //  Returns:    float2(tile_uv, dist) for the first sample, along the
+    //              resampled axis only: its wrapped tile_uv coord, and its
+    //              distance in texels from the destination pixel.
+    //  Sampling runs from the topmost/leftmost tap downward/rightward.  Both
+    //  axes are computed as if for a one-pass 2D resize; the unneeded (and
+    //  incorrect) axis is discarded at the end.
+    const float2 dest_texel = tex_uv * tex_size;
+    //  Center of the texel at/before dest_texel (under_half dodges driver
+    //  rounding errors at exact texel locations):
+    const float2 texel_before =
+        floor(dest_texel - float2(under_half)) + float2(0.5);
+    //  Back up (samples/2 - 1) texels to reach the first tap:
+    const float2 start_texel = texel_before - float2(samples/2.0 - 1.0);
+    const float2 start_dist_2D = dest_texel - start_texel;
+    //  Texels -> tex_uv (* dr) -> tile_uv (* tiles), both still unwrapped, so
+    //  we can sub fracs for fmods:
+    const float2 start_uv_wrap_2D = start_texel * dr;
+    const float2 start_tile_uv_wrap_2D =
+        start_uv_wrap_2D * input_tiles_per_texture_r;
+    //  Wrap with frac(), adding 1.0 to coords that started out negative.
+    //  NOTE(review): If frac(x) is x - floor(x), negative inputs land in
+    //  [1, 2) after the +1.0 rather than [0, 1); confirm this is harmless.
+    const float2 was_negative = float2((start_tile_uv_wrap_2D.x < 0.),
+        (start_tile_uv_wrap_2D.y < 0.));
+    const float2 start_tile_uv_2D = frac(start_tile_uv_wrap_2D) + was_negative;
+    //  Pack (tile_uv, dist) per axis and keep the axis resized this pass:
+    const float2 packed_u = float2(start_tile_uv_2D.x, start_dist_2D.x);
+    const float2 packed_v = float2(start_tile_uv_2D.y, start_dist_2D.y);
+    return vertical ? packed_v : packed_u;
+    //return lerp(packed_u, packed_v, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    //  One [slow] workaround is to force mip level 0 (or a huge negative bias):
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);    //  GLSL needs an explicit lod
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return texture(tex, tex_uv, -16.0); //  GLSL form of Cg tex2Dbias
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(dist) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord in [0, 1].  tex_uv_r will contain the tex_uv u or v coord
+    //  (tile_uv_r scaled by tile_size_uv_r) for four new texel samples.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the uv-space size of an input tile
+    //                  within the input texture (1.0 == the whole texture),
+    //                  in the direction we're resampling this pass.
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
+    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    //  wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Requires:   1.) Requirements of get_resized_mask_tile_size() must be
+    //                  met, particularly regarding global constants.
+    //              The function parameters must be defined as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    //  Returns:    Return a float4 containing:
+    //                  xy: tex_uv start of the mask tile ((0, 0) unless mode 0)
+    //                  zw: tex_uv size of the mask tile ((1, 1) unless mode 0)
+    //              mask_tiles_per_screen is an out parameter containing the
+    //              number of mask tiles that will fit on the screen.
+    //  First get the final resized tile size.  The viewport size and mask
+    //  resize viewport scale must be correct, but don't solemnly swear they
+    //  were correct in both mask resize passes unless you know it's true.
+    //  (We can better ensure a correct tile aspect ratio if the parameters are
+    //  guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    //  sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(mask_sample_mode < 0.5)
+    {
+        //  Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        //  size and starts at a nonzero offset to allow for border texels:
+        const float2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
+        const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        //  mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        //  If we're tiling at the original size (1:1 pixel:texel), redefine a
+        //  "tile" to be the full texture containing many triads.  Otherwise,
+        //  we're hardware-resampling an LUT, and the texture truly contains a
+        //  single unresized phosphor mask tile anyway.
+        static const float2 mask_tile_uv_size = float2(1.0);
+        static const float2 mask_tile_start_uv = float2(0.0);
+        if(mask_sample_mode > 1.5)
+        {
+            //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else
+        {
+            //  Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Requires:   1.) tile_uv_wrap contains tile-relative uv coords, where the
+    //                  tile spans from [0, 1], such that (0.5, 0.5) is at the
+    //                  tile center.  The input coords can range from [0, inf],
+    //                  and their fractional parts map to a repeated tile.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
+    //                  for the start of the embedded tile in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw contains the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    Return tex_uv coords (used for texture sampling)
+    //              corresponding to tile_uv_wrap.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        //  Manually repeat the resized mask tile to fill the screen.  frac/fmod
+        //  on coords confuses anisotropic filtering; fix it as user options
+        //  dictate (derived-settings-and-constants.h disables incompatible ones).
+        //  NOTE(review): fix_tiling_discontinuities_normalized is commented out.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+        #else
+            float2 tile_uv = frac(tile_uv_wrap);
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            const float2 tile_uv_dx = ddx(tile_uv);
+            const float2 tile_uv_dy = ddy(tile_uv);
+            tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
+                tile_uv_dx, tile_uv_dy);
+        #endif
+        //  The tile is embedded in a padded FBO, and it may start at a
+        //  nonzero offset if border texels are used to avoid artifacts:
+        const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+            tile_uv * mask_tile_start_uv_and_size.zw;
+        return mask_tex_uv;
+    }
+    else
+    {
+        //  Sample from the input phosphor mask texture with hardware tiling.
+        //  If we're tiling at the original size (mode 2), the "tile" is the
+        //  whole texture, and it contains a large number of triads mapped with
+        //  a 1:1 pixel:texel ratio.  OTHERWISE, the texture contains a single
+        //  unresized tile.  tile_uv_wrap already has correct coords for both!
+        return tile_uv_wrap;
+    }
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+void main() {
+    //  Resize the input phosphor mask tile to the final vertical size it will
+    //  appear on screen.  Keep 1x horizontal size if possible (IN.output_size
+    //  >= mask_resize_src_lut_size), and otherwise linearly sample horizontally
+    //  to fit exactly one tile.  Lanczos-resizing the phosphor mask achieves
+    //  much sharper results than mipmapping, and vertically resizing first
+    //  minimizes the total number of taps required.  We output a number of
+    //  resized tiles >= mask_resize_num_tiles for easier tiled sampling later.
+    //  src_tex_uv_wrap is a stage input (see the Vertex block in the .vs).
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        //  Discard unneeded fragments in case our profile allows real branches.
+        const float2 tile_uv_wrap = src_tex_uv_wrap;
+        if(get_mask_sample_mode() < 0.5 &&
+            tile_uv_wrap.y <= mask_resize_num_tiles)
+        {
+            static const float src_dy = 1.0/mask_resize_src_lut_size.y;
+            const float2 src_tex_uv = frac(src_tex_uv_wrap);
+            float3 pixel_color;
+            //  If mask_type is static, this branch will be resolved statically.
+			#ifdef PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+				if(mask_type < 0.5)
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_grille_texture_large, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+				else if(mask_type < 1.5)
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_slot_texture_large, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+				else
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_shadow_texture_large, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+			#else
+				if(mask_type < 0.5)
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_grille_texture_small, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+				else if(mask_type < 1.5)
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_slot_texture_small, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+				else
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_shadow_texture_small, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+			#endif
+            //  The input LUT was linear RGB, and so is our output:
+            FragColor = float4(pixel_color, 1.0);
+        }
+        else
+        {
+            discard;
+        }
+    #else
+        discard;
+        FragColor = float4(1.0, 1.0, 1.0, 1.0);  //  unreachable after discard
+	#endif
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/mask-resize-vertical.vs b/shaders/CRT-Royale.shader/mask-resize-vertical.vs
new file mode 100644
index 00000000..2dac429b
--- /dev/null
+++ b/shaders/CRT-Royale.shader/mask-resize-vertical.vs
@@ -0,0 +1,3212 @@
+#version 150
+
+in vec4 position;   // per-vertex input attribute
+in vec2 texCoord;   // per-vertex input attribute
+
+out Vertex {   // interface block written by this vertex stage
+   vec2 vTexCoord;
+   vec2 src_tex_uv_wrap;
+   vec2 resize_magnification_scale;
+};
+
+uniform vec4 targetSize;     // .xy is aliased as output_size by a macro below
+uniform vec4 sourceSize[];   // [0].xy is aliased as texture_size and video_size below
+uniform int phase;           // aliased as frame_count by a macro below
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// Compatibility macros: let the HLSL/Cg-style code below compile as GLSL by aliasing names and types
+#define mul(a,b) (b*a)  // operands are swapped in the expansion; presumably for matrix-convention differences - confirm
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy  // same uniform as video_size
+#define video_size sourceSize[0].xy  // same uniform as texture_size
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  // expands to nothing: the qualifier is erased from the code below
+#define inline  // expands to nothing: the qualifier is erased from the code below
+#define const  // expands to nothing: the qualifier is erased from the code below
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  // NOTE(review): expansion swaps the two arguments; confirm call sites expect this
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (raise toward 1.0 if the result looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5 (an earlier note said 0.5 looked unnecessarily dark and 1.0 was used instead, but the value set here is 0.5)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.cgh, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif  //  DRIVERS_ALLOW_DERIVATIVES
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try 1.0 if 0.5 looks unnecessarily dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  2: true 4x4 Gaussian resize
+#else   //  RADEON_FIX is #defined
+    static const float bloom_approx_filter_static = 1.0;    //  1: 3x3 resize blur
+#endif  //  RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;  //  14-pixel stripes
+#else   //  15-pixel stripes (default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; abs() drops the sign of c, so the result is never negative
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif  //  PHOSPHOR_MASK_MANUALLY_RESIZE
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif  //  RUNTIME_GEOMETRY_MODE
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0 (bilinear resize):
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;  //  "> 1.5" matches mode 2.0 without an exact float compare
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static choice unchanged
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility: requesting a TEX2DLOD strategy also requests the matching TEX2DBIAS one.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+//  Rule out anisotropic compatibility strategies whose required driver capability is not #defined:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC   //  NOTE(review): not #defined in this part of the file; confirm where it is set
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Requires:   geom_max_aspect_ratio must be defined globally.
+    //  Returns:    The normalized (aspect ratio, 1.0) vector, with the ratio
+    //              capped at geom_max_aspect_ratio.  Normalizing prevents the
+    //              absolute scale from affecting the uv-mapping for curvature.
+    const float capped_aspect_ratio =
+        min(geom_aspect_ratio, geom_max_aspect_ratio);
+    return normalize(float2(capped_aspect_ratio, 1.0));
+}
+
+inline float2 get_geom_overscan_vector()
+{
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        return aa_subpixel_r_offset_static;
+    #else   //  Runtime weights are on; the offsets need their own opt-in:
+        #ifndef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            return aa_subpixel_r_offset_static;
+        #else
+            //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #endif
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Returns:    The reciprocal of the average color of the mask selected
+    //              by mask_type: the grille value below 0.5, the slot value
+    //              below 1.5, and the shadow value otherwise.
+    return mask_type < 0.5 ? 1.0/mask_grille_avg_color :
+        mask_type < 1.5 ? 1.0/mask_slot_avg_color :
+        1.0/mask_shadow_avg_color;
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Returns:    The requested mask sample mode, clamped to [1.0, 2.0]
+    //              unless PHOSPHOR_MASK_MANUALLY_RESIZE is defined.
+    //  First pick the runtime or static source of the requested mode:
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        const float requested_mode = mask_sample_mode_desired;
+    #else
+        const float requested_mode = mask_sample_mode_static;
+    #endif
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what's allowed:
+//  Dynamic loops not allowed: Use a flat static loop.
+//  Dynamic loops accomodated: Coarsely branch around static loops.
+//  Dynamic loops assumed allowed: Use a flat dynamic loop.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  See if we can get a static min tile size > mask_min_allowed_tile_size:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size = 
+        mask_min_allowed_tile_size;
+//  Limit the number of sinc resize taps by the maximum minification factor:
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   Global constants mask_sinc_lobes and
+    //              max_sinc_resize_samples_m4 must be defined.
+    //  Returns:    The minimum number of texture samples for a correct downsize
+    //              at magnification_scale, rounded up to a multiple of 4 and
+    //              capped at the maximum the looping strategy supports.
+    //  We're downsizing, so the filter is sized across 2*lobes output pixels
+    //  (not 2*lobes input texels).  This impacts distance measurements and the
+    //  minimum number of input samples needed.
+    const float needed_samples_m4 =
+        ceil(2.0 * mask_sinc_lobes / magnification_scale * 0.25) * 4.0;
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float sample_cap_m4 = max_sinc_resize_samples_m4;
+    #else
+        //  Simulating loops with branches imposes a 128-sample limit.
+        const float sample_cap_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    return min(needed_samples_m4, sample_cap_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, 
+    const float2 tex_size, const float dr, 
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) input_tiles_per_texture_r is the number of input tiles
+    //                  that can fit in the input texture in the direction we're
+    //                  resampling this pass.
+    //              3.) vertical indicates whether we're resampling vertically
+    //                  this pass (or horizontally).
+    //  Returns:    Pack and return the first sample's tile_uv coord (.x)
+    //              and its texel distance from the destination pixel (.y), in
+    //              the resized dimension only.
+    //  We'll start with the topmost or leftmost sample and work down or right,
+    //  so get the first sample location and distance.  Modify both dimensions
+    //  as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    //  (and incorrect) dimension at the end.
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);
+    const float2 first_texel_uv_wrap_2D = first_texel * dr;
+    const float2 first_texel_dist_2D = curr_texel - first_texel;
+    //  Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const float2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    //  Project wrapped coordinates to the [0, 1] range.  We'll do this with all
+    //  samples, but the first texel is special, since it might be negative.
+    const float2 coord_negative =
+        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));
+    const float2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
+    //  Pack the first texel's tile_uv coord and texel distance in 1D:
+    const float2 tile_u_and_dist =
+        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const float2 tile_v_and_dist =
+        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    //  One [slow] workaround is to select the lowest mip level (LOD 0.0):
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);
+    #else   //  No explicit-LOD fetch: try a large negative bias instead.
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
+        #else   //  Neither workaround is enabled: do a plain lookup.
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord in [0, 1].  tile_uv_r holds the wrapped tile_uv coord for
+    //  four new texel samples, and tex_uv_r rescales it by tile_size_uv_r.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the size of an input tile in uv
+    //                  units of the input texture, in the direction we're
+    //                  resampling this pass (its reciprocal is tiles/texture).
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;  //  LUT height/width
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;  //  LUT width/height
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);  //  scales a tile width into an (x, y) size
+    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    //  wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);  //  0.0 picks the triad size, 1.0 picks viewport width / triad count
+    if(get_mask_sample_mode() > 0.5)  //  any mode that does not sample MASK_RESIZE
+    {
+        //  We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));  //  false keeps clamped_tile_size.y unchanged
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));  //  FIX_ZERO is defined elsewhere; presumably a tiny positive bias
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Requires:   1.) Requirements of get_resized_mask_tile_size() must be
+    //                  met, particularly regarding global constants.
+    //              The function parameters must be defined as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    //  Returns:    Return a float4 containing:
+    //                  xy: tex_uv coords for the start of the mask tile
+    //                  zw: tex_uv size of the mask tile from start to end
+    //              mask_tiles_per_screen is an out parameter containing the
+    //              number of mask tiles that will fit on the screen.
+    //  First get the final resized tile size.  The viewport size and mask
+    //  resize viewport scale must be correct, but don't solemnly swear they
+    //  were correct in both mask resize passes unless you know it's true.
+    //  (We can better ensure a correct tile aspect ratio if the parameters are
+    //  guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    //  sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);  //  false: inputs are not sworn identical across passes
+    if(mask_sample_mode < 0.5)  //  mode 0
+    {
+        //  Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        //  size and starts at a nonzero offset to allow for border texels:
+        const float2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;  //  border offset expressed in tiles
+        const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        //  mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        //  If we're tiling at the original size (1:1 pixel:texel), redefine a
+        //  "tile" to be the full texture containing many triads.  Otherwise,
+        //  we're hardware-resampling an LUT, and the texture truly contains a
+        //  single unresized phosphor mask tile anyway.
+        static const float2 mask_tile_uv_size = float2(1.0);  //  the tile spans the whole texture
+        static const float2 mask_tile_start_uv = float2(0.0);  //  and starts at its origin
+        if(mask_sample_mode > 1.5)  //  mode 2
+        {
+            //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else  //  mode 1
+        {
+            //  Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Requires:   1.) tile_uv_wrap contains tile-relative uv coords, where the
+    //                  tile spans from [0, 1], such that (0.5, 0.5) is at the
+    //                  tile center.  The input coords can range from [0, inf],
+    //                  and their fractional parts map to a repeated tile.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
+    //                  for the start of the embedded tile in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw contains the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    Return tex_uv coords (used for texture sampling)
+    //              corresponding to tile_uv_wrap.
+    if(get_mask_sample_mode() < 0.5)  //  mode 0: sample the manually resized mask
+    {
+        //  Manually repeat the resized mask tile to fill the screen:
+        //  First get fractional tile_uv coords.  Using frac/fmod on coords
+        //  confuses anisotropic filtering; fix it as user options dictate.
+        //  derived-settings-and-constants.h disables incompatible options.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;  //  wraps over [0, 2) rather than [0, 1)
+        #else
+            float2 tile_uv = frac(tile_uv_wrap);
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES  //  NOTE(review): the helper called below is commented out just above this function; confirm this macro is never defined here
+            const float2 tile_uv_dx = ddx(tile_uv);
+            const float2 tile_uv_dy = ddy(tile_uv);
+            tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
+                tile_uv_dx, tile_uv_dy);
+        #endif
+        //  The tile is embedded in a padded FBO, and it may start at a
+        //  nonzero offset if border texels are used to avoid artifacts:
+        const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+            tile_uv * mask_tile_start_uv_and_size.zw;  //  start + fraction * tile size
+        return mask_tex_uv;
+    }
+    else
+    {
+        //  Sample from the input phosphor mask texture with hardware tiling.
+        //  If we're tiling at the original size (mode 2), the "tile" is the
+        //  whole texture, and it contains a large number of triads mapped with
+        //  a 1:1 pixel:texel ratio.  OTHERWISE, the texture contains a single
+        //  unresized tile.  tile_uv_wrap already has correct coords for both!
+        return tile_uv_wrap;  //  mask_tile_start_uv_and_size is not consulted on this path
+    }
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+void main() {
+   gl_Position = position;  //  vertex position is forwarded unchanged
+   vTexCoord = texCoord;  //  as is the incoming texture coordinate
+   float2 tex_uv = vTexCoord.xy;
+    //  First estimate the viewport size (the user will get the wrong number of
+    //  triads if it's wrong and mask_specify_num_triads is 1.0/true).
+    const float viewport_y = output_size.y / mask_resize_viewport_scale.y;
+    const float aspect_ratio = geom_aspect_ratio_x / geom_aspect_ratio_y;
+    const float2 estimated_viewport_size =
+        float2(viewport_y * aspect_ratio, viewport_y);
+    //  Estimate the output size of MASK_RESIZE (the next pass).  The estimated
+    //  x component shouldn't matter, because we're not using the x result, and
+    //  we're not swearing it's correct (if we did, the x result would influence
+    //  the y result to maintain the tile aspect ratio).
+    const float2 estimated_mask_resize_output_size =
+        float2(output_size.y * aspect_ratio, output_size.y);
+    //  Find the final intended [y] size of our resized phosphor mask tiles,
+    //  then the tile size for the current pass (resize y only):
+    float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        estimated_viewport_size, estimated_mask_resize_output_size, false);
+    float2 pass_output_tile_size = float2(min(
+        mask_resize_src_lut_size.x, output_size.x), mask_resize_tile_size.y);  //  x: LUT width capped at the output width; y: resized height
+
+    //  We'll render resized tiles until filling the output FBO or meeting a
+    //  limit, so compute [wrapped] tile uv coords based on the output uv coords
+    //  and the number of tiles that will fit in the FBO.
+    const float2 output_tiles_this_pass = output_size / pass_output_tile_size;
+    const float2 output_video_uv = tex_uv * texture_size / video_size;  //  rescales uv by the texture-to-video size ratio
+    const float2 tile_uv_wrap = output_video_uv * output_tiles_this_pass;
+
+    //  The input LUT is just a single mask tile, so texture uv coords are the
+    //  same as tile uv coords (save frac() for the fragment shader).  The
+    //  magnification scale is also straightforward:
+    src_tex_uv_wrap = tile_uv_wrap;
+    resize_magnification_scale =
+        pass_output_tile_size / mask_resize_src_lut_size;
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.fs b/shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.fs
new file mode 100644
index 00000000..a987afbb
--- /dev/null
+++ b/shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.fs
@@ -0,0 +1,10845 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+uniform sampler2D source[];  //  indexed pass textures; [0], [2], [4] and [5] receive named aliases below
+uniform vec4 sourceSize[];  //  one size vector per source[] entry (presumably the same indexing)
+uniform vec4 targetSize;  //  .xy is exposed as output_size below
+uniform int phase;  //  exposed as frame_count below
+uniform sampler2D pixmap[];  //  [1], [3] and [5] are aliased as the large grille/slot/shadow mask textures below
+
+in Vertex {
+   vec2 vTexCoord;
+   vec2 video_uv;
+   vec2 scanline_tex_uv;
+   vec2 blur3x3_tex_uv;
+   vec2 halation_tex_uv;
+   vec2 scanline_texture_size_inv;
+   vec4 mask_tile_start_uv_and_size;  //  presumably xy = tile start uv, zw = tile uv size -- confirm against the vertex stage
+   vec2 mask_tiles_per_screen;
+};
+
+out vec4 FragColor;  //  the only output declared by this fragment stage
+
+// USER SETTINGS BLOCK: each tunable name below expands to a fixed literal //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000  //  presumably indexes grille/slot/shadow in that order -- TODO confirm
+#define mask_sample_mode_desired 0.000000  //  presumably 0 = manual (Lanczos) resize, 1 = hardware resize, 2 = tile 1:1 -- TODO confirm
+#define mask_specify_num_triads 0.000000  //  0.0/1.0 flag; presumably 0.0 = use mask_triad_size_desired, 1.0 = use mask_num_triads_desired
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000  //  432:329 is roughly 1.313:1
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+#define VERTICAL_SCANLINEStexture source[5]  //  named aliases for the indexed pass textures and their sizes
+#define VERTICAL_SCANLINEStexture_size sourceSize[5].xy  //  texture_size and video_size resolve to the same vector for each pass here
+#define VERTICAL_SCANLINESvideo_size sourceSize[5].xy
+#define BLOOM_APPROXtexture source[4]
+#define BLOOM_APPROXtexture_size sourceSize[4].xy
+#define BLOOM_APPROXvideo_size sourceSize[4].xy
+#define HALATION_BLURtexture source[2]
+#define HALATION_BLURtexture_size sourceSize[2].xy
+#define HALATION_BLURvideo_size sourceSize[2].xy
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE  //  NOTE(review): both branches expand identically, and this macro's (commented-out) #define only appears further down
+	#define MASK_RESIZEtexture source[0]
+#else
+	#define MASK_RESIZEtexture source[0]
+#endif
+#define MASK_RESIZEtexture_size sourceSize[0]  //  NOTE(review): full vec4 here, whereas the other *_size aliases take .xy -- confirm intended
+#define MASK_RESIZEvideo_size sourceSize[0]  //  NOTE(review): same as above
+
+#define input_texture source[0]  //  repeated verbatim after the compatibility macros below
+#define mask_grille_texture_large pixmap[1]
+#define mask_slot_texture_large pixmap[3]
+#define mask_shadow_texture_large pixmap[5]
+
+float bloom_approx_scale_x = targetSize.x / sourceSize[0].y;  //  NOTE(review): divides an x extent by a y extent -- confirm intended
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);  //  still a true const: the macro that erases "const" comes later
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms
+#define mul(a,b) (b*a)  //  operand order is swapped; arguments are expanded without parentheses
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy  //  unprefixed size names all refer to source[0]
+#define video_size sourceSize[0].xy  //  identical to texture_size, so their ratio is 1
+#define output_size targetSize.xy
+#define frame_count phase
+#define static    //  expands to nothing
+#define inline    //  expands to nothing
+#define const    //  expands to nothing: every later "const" becomes an ordinary mutable declaration
+#define fmod(x,y) mod(x,y)  //  NOTE(review): GLSL mod() and HLSL fmod() differ for negative x -- confirm callers only pass non-negative values
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  //  NOTE(review): swaps the arguments, yet HLSL atan2(y, x) and GLSL atan(y, x) share the same order -- confirm call sites
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]  //  same expansion as the earlier definition of input_texture
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130  //  this file declares #version 150, so the first branch is taken
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+// VERTEX INCLUDES //
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES  //  its #define just above is commented out, so this stays off unless it is defined elsewhere
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; the default 0.5 is what is set here (an earlier note called it too dark and mentioned 1.0, but 1.0 is not in effect)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;  //  option 2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;  //  option 1: 3x3 resize blur
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;          //  0: power, 1: spherical
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;      //  range [0, 2]
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try 1.0 if that looks unnecessarily dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;          //  0: power, 1: spherical
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;      //  range [0, 2]
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero by returning max(|c|, 2^-16); the sign of c is discarded.  Using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (any value above 1.5) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  No compatibility mode: use the static setting unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD   //  tex2Dlod survives: undefine every other tiling strategy
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  tex2Dlod strategy is not defined
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS  //  tex2Dbias survives: undefine "tile flat twice" and "fix discontinuities"
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  neither tex2Dlod nor tex2Dbias is defined
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE    //  "tile flat twice" takes precedence over "fix discontinuities"
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  tex2Dlod allowed but not selected for resampling: use the small LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  tex2Dlod not allowed: use the small LUT
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        //  Subpixel offsets are signed (blue = -red), so pad by the largest offset magnitude, never a negative value:
+        static const float max_subpixel_offset = max(abs(aa_subpixel_r_offset_static.x), abs(aa_subpixel_r_offset_static.y));
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  General case: one tile plus twice the tile border; sampling starts past the texel border
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: always use the bordered layout
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an earlier edit used 1.0 because 0.5 looked too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an earlier edit used 1.0 because 0.5 looked too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero (NOTE: drops the sign); overloads float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  tex2Dlod strategy not #defined: try tex2Dbias next
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  Neither tex2Dlod nor tex2Dbias: "tile flat twice" beats "fix"
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset =    //  may be < 0, so use |x|
+            abs(aa_subpixel_r_offset_static.x);
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifdef GEOMETRY_EARLY
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#else
+    #ifndef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #else
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #endif
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Returns a unit-length vector pointing along (aspect ratio, 1.0).
+    //  Cap the requested ratio at geom_max_aspect_ratio first:
+    const float capped_ratio = min(geom_aspect_ratio, geom_max_aspect_ratio);
+    //  Normalizing prevents the absolute scale from affecting the uv-mapping
+    //  for curvature:
+    const float2 aspect_dir = float2(capped_ratio, 1.0);
+    return normalize(aspect_dir);
+}
+
+inline float2 get_geom_overscan_vector()   //  Packs overscan as float2(x, y).
+{
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector() //  Packs tilt angles as float2(x, y).
+{
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()   //  x offsets as (r, g, b).
+{
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()   //  y offsets as (r, g, b).
+{
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()   //  Red offsets as (x, y).
+{
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()   //  Green offsets as (x, y).
+{
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()   //  Blue offsets as (x, y).
+{
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        return aa_subpixel_r_offset_static;
+    #else
+        #ifndef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            return aa_subpixel_r_offset_static;
+        #else
+            //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #endif
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking:"
+inline float get_mask_amplify()
+{
+    //  Gain = 1/avg mask color; mask_type: < 0.5 grille, < 1.5 slot, else shadow.
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    return mask_type < 0.5 ? grille_gain :
+        (mask_type < 1.5 ? slot_gain : shadow_gain);
+}
+
+inline float get_mask_sample_mode()
+{
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE    //  pass the mode through as-is
+        #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+            return mask_sample_mode_desired;
+        #else
+            return mask_sample_mode_static;
+        #endif
+    #else                                   //  clamp the mode to [1, 2]
+        #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+            return clamp(mask_sample_mode_desired, 1.0, 2.0);
+        #else
+            return clamp(mask_sample_mode_static, 1.0, 2.0);
+        #endif
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try up to 1.0 if 0.5 looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX  //  NOTE(review): not #defined in these settings; presumably set externally
+    static const float bloom_approx_filter_static = 2.0;    //  option 2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  option 1: 3x3 resize blur
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//".
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?  (Automatically disabled by INTEGRATED_GRAPHICS_COMPATIBILITY_MODE.)
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.  (Automatically disabled by INTEGRATED_GRAPHICS_COMPATIBILITY_MODE.)
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try 1.0 if the result looks unnecessarily dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5] (NOTE(review): the default x below is -1/3, so this range presumably bounds the magnitude -- confirm)
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. re-add horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);//  disabled example of option b. with N = 1.005: * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values (grille14, grille15, slot, shadow) are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; note that abs() discards the sign of c
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (any value > 1.5) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used (i.e. whenever
+//  RUNTIME_SHADER_PARAMS_ENABLE isn't #defined).  Most of these won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies (those whose DRIVERS_ALLOW_* prerequisite isn't #defined):
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies (at most one tiling strategy stays #defined).  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts (i.e. with both
+//  DRIVERS_ALLOW_TEX2DLOD and ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD #defined), we should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE (one tile
+//  plus a tile border on each side) and the starting texel inside the borders.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral[0, x](e**(-t**2) dt)
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Return an Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral[0, x](e**(-t**2) dt)
+    //              The formula is evaluated on abs(x) and multiplied by sign(x).
+    //              The cited max absolute error is 2.5*10**-5.  See:
+	//                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+	static const float4 one = float4(1.0);
+	const float4 sign_x = sign(x);
+	const float4 t = one/(one + 0.47047*abs(x));
+	const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 version: same formula and constants as erf6(float4).
+	static const float3 one = float3(1.0);
+	const float3 sign_x = sign(x);
+	const float3 t = one/(one + 0.47047*abs(x));
+	const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 version: same formula and constants as erf6(float4).
+	static const float2 one = float2(1.0);
+	const float2 sign_x = sign(x);
+	const float2 t = one/(one + 0.47047*abs(x));
+	const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float erf6(const float x)
+{
+    //  Float version: same formula as erf6(float4), using scalar literals.
+	const float sign_x = sign(x);
+	const float t = 1.0/(1.0 + 0.47047*abs(x));
+	const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Approximate erf(x) as tanh(1.202760580 * x).  The error is
+    //              visually noticeable, but it's blazing fast and perceptually
+    //              close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly implement
+    //              tanh(): My nVidia 8800GTS returns garbage output.
+	return tanh(1.202760580 * x);
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 version: same tanh() approximation (and warning) as erft(float4).
+	return tanh(1.202760580 * x);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 version: same tanh() approximation (and warning) as erft(float4).
+	return tanh(1.202760580 * x);
+}
+
+float erft(const float x)
+{
+    //  Float version: same tanh() approximation (and warning) as erft(float4).
+	return tanh(1.202760580 * x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 version: erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 version: erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float erf(const float x)
+{
+    //  Float version: erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  This implementation function requires
+    //                  the caller to precompute this value, giving users the
+    //                  opportunity to reuse it.
+    //  Returns:    Return approximate gamma function (real-numbered factorial)
+    //              output using the Lanczos approximation with two coefficients
+    //              calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  We could use three coeffs (0.0000346 error)
+    //              without hurting latency, but this allows more parallelism
+    //              with outside instructions.
+	static const float4 g = float4(1.12906830989);
+	static const float4 c0 = float4(0.8109119309638332633713423362694399653724431);
+	static const float4 c1 = float4(0.4808354605142681877121661197951496120000040);
+	static const float4 e = float4(2.71828182845904523536028747135266249775724709);
+	const float4 sph = s + float4(0.5);
+	const float4 lanczos_sum = c0 + c1/(s + float4(1.0));
+	const float4 base = (sph + g)/e;  //  or (s + g + float4(0.5))/e
+	//  gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
+	//  This has less error for small s's than (s -= 1.0) at the beginning.
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 version: same constants and formula as gamma_impl(float4).
+	static const float3 g = float3(1.12906830989);
+	static const float3 c0 = float3(0.8109119309638332633713423362694399653724431);
+	static const float3 c1 = float3(0.4808354605142681877121661197951496120000040);
+	static const float3 e = float3(2.71828182845904523536028747135266249775724709);
+	const float3 sph = s + float3(0.5);
+	const float3 lanczos_sum = c0 + c1/(s + float3(1.0));
+	const float3 base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 version: same constants and formula as gamma_impl(float4).
+	static const float2 g = float2(1.12906830989);
+	static const float2 c0 = float2(0.8109119309638332633713423362694399653724431);
+	static const float2 c1 = float2(0.4808354605142681877121661197951496120000040);
+	static const float2 e = float2(2.71828182845904523536028747135266249775724709);
+	const float2 sph = s + float2(0.5);
+	const float2 lanczos_sum = c0 + c1/(s + float2(1.0));
+	const float2 base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Float version: same constants and formula as gamma_impl(float4).
+	static const float g = 1.12906830989;
+	static const float c0 = 0.8109119309638332633713423362694399653724431;
+	static const float c1 = 0.4808354605142681877121661197951496120000040;
+	static const float e = 2.71828182845904523536028747135266249775724709;
+	const float sph = s + 0.5;
+	const float lanczos_sum = c0 + c1/(s + 1.0);
+	const float base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard parameter to the gamma function, and it
+    //              should lie in the [0, 36] range.
+    //  Returns:    gamma_impl(s, 1.0/s): approximate gamma function output (max
+    //              relative error 0.000463).  See gamma_impl for details.
+	return gamma_impl(s, float4(1.0)/s);
+}
+
+float3 gamma(const float3 s)
+{
+    //  Float3 version: computes 1.0/s and forwards to gamma_impl().
+	return gamma_impl(s, float3(1.0)/s);
+}
+
+float2 gamma(const float2 s)
+{
+    //  Float2 version: computes 1.0/s and forwards to gamma_impl().
+	return gamma_impl(s, float2(1.0)/s);
+}
+
+float gamma(const float s)
+{
+    //  Float version: computes 1.0/s and forwards to gamma_impl().
+	return gamma_impl(s, 1.0/s);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The actual "rolled up" summation looks like:
+	//      last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
+	//      sum = last_sign * last_pow / (s * last_factorial);  //  k = 0 term
+	//      for(int k = 1; k < 4; ++k)
+	//      {
+	//          last_sign *= -1.0; last_pow *= z; last_factorial *= k;
+	//          sum += last_sign * last_pow / ((s + k) * last_factorial);
+	//      }
+	//  Unrolled, constant-unfolded and arranged for madds and parallelism:
+	const float4 scale = pow(z, s);
+	float4 sum = s_inv;  //  Summation iteration 0 result
+	//  Summation iterations 1, 2, and 3:
+	const float4 z_sq = z*z;
+	const float4 denom1 = s + float4(1.0);
+	const float4 denom2 = 2.0*s + float4(4.0);
+	const float4 denom3 = 6.0*s + float4(18.0);
+	//float4 denom4 = 24.0*s + float4(96.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	//sum += z_sq * z_sq / denom4;
+	//  Scale and return:
+	return scale * sum;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 version: same 4-term series as the float4 overload.
+	const float3 scale = pow(z, s);
+	float3 sum = s_inv;
+	const float3 z_sq = z*z;
+	const float3 denom1 = s + float3(1.0);
+	const float3 denom2 = 2.0*s + float3(4.0);
+	const float3 denom3 = 6.0*s + float3(18.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 version: same 4-term series as the float4 overload.
+	const float2 scale = pow(z, s);
+	float2 sum = s_inv;
+	const float2 z_sq = z*z;
+	const float2 denom1 = s + float2(1.0);
+	const float2 denom2 = 2.0*s + float2(4.0);
+	const float2 denom3 = 6.0*s + float2(18.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Float version: same 4-term series as the float4 overload.
+	const float scale = pow(z, s);
+	float sum = s_inv;
+	const float z_sq = z*z;
+	const float denom1 = s + 1.0;
+	const float denom2 = 2.0*s + 4.0;
+	const float denom3 = 6.0*s + 18.0;
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    z**s * e**(-z) / denom, where denom is Gauss's continued
+    //              fraction for the upper incomplete gamma function (4 terms).
+	//  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up:"
+	//      denom = float4('inf');
+	//      float4 one = float4(1.0);
+	//      for(int i = 4; i > 0; --i)
+	//      {
+	//          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+	//      }
+	//  Unrolled and constant-unfolded for madds and parallelism:
+	const float4 numerator = pow(z, s) * exp(-z);
+	float4 denom = float4(7.0) + z - s;
+	denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom;
+	denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom;
+	denom = float4(1.0) + z - s + (s - float4(1.0))/denom;
+	return numerator / denom;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 version: same 4-level continued fraction as the float4 overload.
+	const float3 numerator = pow(z, s) * exp(-z);
+	float3 denom = float3(7.0) + z - s;
+	denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom;
+	denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom;
+	denom = float3(1.0) + z - s + (s - float3(1.0))/denom;
+	return numerator / denom;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  Float2 version: same 4-level continued fraction as the float4 overload.
+	const float2 numerator = pow(z, s) * exp(-z);
+	float2 denom = float2(7.0) + z - s;
+	denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom;
+	denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom;
+	denom = float2(1.0) + z - s + (s - float2(1.0))/denom;
+	return numerator / denom;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Float version: same 4-level continued fraction as the float4 overload.
+	const float numerator = pow(z, s) * exp(-z);
+	float denom = 7.0 + z - s;
+	denom = 5.0 + z - s + (3.0*s - 9.0)/denom;
+	denom = 3.0 + z - s + (2.0*s - 4.0)/denom;
+	denom = 1.0 + z - s + (s - 1.0)/denom;
+	return numerator / denom;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    Approximate the normalized lower incomplete gamma function
+    //              for s < 0.5.  Since we only care about s < 0.5, we only need
+    //              to evaluate two branches (not four) based on z.  Each branch
+    //              uses four terms, with a max relative error of ~0.00182.  The
+    //              branch threshold and specifics were adapted for fewer terms
+    //              from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+	//  Evaluate both branches: Real branches test slower even when available.
+	static const float4 thresh = float4(0.775075);
+	bool4 z_is_large;
+	z_is_large.x = z.x > thresh.x;
+	z_is_large.y = z.y > thresh.y;
+	z_is_large.z = z.z > thresh.z;
+	z_is_large.w = z.w > thresh.w;
+	const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	//  Blend per component: each bool mask is converted to a float multiplier:
+	bool4 inverse_z_is_large = not(z_is_large);
+	return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  Float3 version: both branches are evaluated, then blended by z > thresh.
+	static const float3 thresh = float3(0.775075);
+	bool3 z_is_large;
+	z_is_large.x = z.x > thresh.x;
+	z_is_large.y = z.y > thresh.y;
+	z_is_large.z = z.z > thresh.z;
+	const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool3 inverse_z_is_large = not(z_is_large);
+	return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  Float2 version: both branches are evaluated, then blended by z > thresh.
+	static const float2 thresh = float2(0.775075);
+	bool2 z_is_large;
+	z_is_large.x = z.x > thresh.x;
+	z_is_large.y = z.y > thresh.y;
+	const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool2 inverse_z_is_large = not(z_is_large);
+	return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Float version: both branches are evaluated, then blended by z > thresh.
+	static const float thresh = 0.775075;
+	const bool z_is_large = z > thresh;
+	const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	return large_z * float(z_is_large) + small_z * float(!z_is_large);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    normalized_ligamma_impl() output after precomputing 1/s and
+    //              1/gamma(s) for it.  See normalized_ligamma_impl() for details.
+	const float4 s_inv = float4(1.0)/s;
+	const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv);
+	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  Float3 version: precomputes 1/s and 1/gamma(s), then calls the _impl.
+	const float3 s_inv = float3(1.0)/s;
+	const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv);
+	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  Float2 version: precomputes 1/s and 1/gamma(s), then calls the _impl.
+	const float2 s_inv = float2(1.0)/s;
+	const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv);
+	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Float version: precomputes 1/s and 1/gamma(s), then calls the _impl.
+	const float s_inv = 1.0/s;
+	const float gamma_s_inv = 1.0/gamma_impl(s, s_inv);
+	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas (defined unless OVERRIDE_STANDARD_GAMMA is set):
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRTs seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  Within (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  Lower limit of (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  Same as crt_reference_gamma_high
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  false: encode/decode functions return color.a unchanged
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    //  Defaults: reference CRT gamma, office LCD gamma, and a literal for GBA:
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+#else   //  The user promises to globally define the constants returned below:
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define intermediate_gamma, input_gamma,
+    //  and output_gamma:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  When every pass is gamma-corrected, passes always exchange colors
+    //  encoded with ntsc_gamma, so middle passes never need to care whether
+    //  a device is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    //  Choose the first-pass decoding gamma and the last-pass encoding gamma
+    //  from the SIMULATE_* macros.  They are tested in the order listed, and
+    //  the first one that is defined wins.  With none defined, ntsc_gamma is
+    //  used on both ends:
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_CRT_ON_LCD / GBA_ON_LCD / LCD_ON_CRT / GBA_ON_CRT
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    static const bool linearize_input = true;       //  Every pass decodes by hand
+    static const bool gamma_encode_output = true;   //  Every pass encodes by hand
+    #ifndef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #else
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #endif
+    #ifndef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #else
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #endif
+#else   //  Only the first pass decodes, and only the last pass encodes:
+    #ifndef FIRST_PASS
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #else
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #endif
+    #ifndef LAST_PASS
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #else
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Lets users query whether bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;  //  True iff decode_input() leaves samples untouched
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode a color for this pass's output.  Passes that don't
+    //  encode (gamma_encode_output == false) return the color untouched:
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    //  Raise rgb to the reciprocal of this pass's output gamma.  Alpha is
+    //  never gamma-encoded; it is either forced to 1.0 or passed through:
+    const float3 encoded_rgb =
+        pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize a sampled color for this pass.  Passes that don't decode
+    //  (linearize_input == false) return the sample untouched:
+    if(!linearize_input)
+    {
+        return color;
+    }
+    //  Raise rgb to this pass's input gamma.  Alpha is never decoded; it is
+    //  either forced to 1.0 or passed through:
+    const float3 linear_rgb =
+        pow(color.rgb, float3(get_pass_input_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Linearize rgb with a caller-supplied per-channel gamma, regardless of
+    //  the pass-wide linearize_input setting.  Alpha is never decoded:
+    const float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: sample tex, then run the texel through decode_input().  Coords are non-const, presumably per the EDIT note above.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 overload: only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is forwarded as textureLod's third argument (the LOD), not as a texel offset - confirm intent.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  float3 overload of the above: only tex_coords.xy is used, and texel_off again lands in the LOD slot.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: this overload samples with a fixed LOD of 0.0; tex_coords.zw are not read.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): texel_off is passed as textureLod's LOD argument and tex_coords.zw are not read - confirm intent.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: sample with a fixed LOD of 0.0 (tex_coords.zw are not read), then decode with the given gamma.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): texel_off is passed as textureLod's LOD argument and tex_coords.zw are not read - confirm intent.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Purpose:    Map a scanline color to the beam's Gaussian sigma.
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  holding the beam standard deviations wanted for dim and
+    //                  bright colors respectively.
+    //              2.) beam_max_sigma > 0.0
+    //              3.) beam_min_sigma is in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power is defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline.
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma.  Callers
+    //                  pass it in so it isn't recomputed per call when
+    //                  beam_{min, max}_sigma are runtime shader parameters.
+    //  Optional:   beam_spot_shape_function picks the inner curve f(color).
+    //              If it is < 0.5:
+    //                  f(color) = pow(color, beam_spot_power)
+    //              Otherwise (users set it to 1 for this form):
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //  Returns:    sigma = beam_min_sigma + sigma_range * f(color)
+    //  How f() shapes the spot:
+    //  The spot shape loosely resembles an aspect-corrected plot of f() over
+    //  [0, 1] (related, not identical).  A linear f(color) = color yields
+    //  diamond-like spots; a spherical function or cube trades width
+    //  variation against a soft, realistic shape.  beam_spot_power > 1.0 can
+    //  give an ugly spot and more initial clipping, though the final shape
+    //  also depends on the horizontal resampling filter and the phosphor
+    //  bloom: resampling horizontally in nonlinear light and/or with a sharp
+    //  (e.g. Lanczos) filter sharpens the spot, yet a sixth root stays quite
+    //  soft.  The power curve (default beam_spot_power of 1.0/3.0) is the
+    //  most flexible; the fixed spherical curve has the highest variability
+    //  without an awful spot shape.
+    //
+    //  beam_min_sigma sets scanline sharpness/aliasing in dim areas, and its
+    //  distance from beam_max_sigma sets how much the beam width varies.  It
+    //  only affects clipping [for pure Gaussians] if beam_spot_power > 1.0
+    //  (a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma sets clipping and how much scanlines widen/soften as
+    //  color rises.  Wider beams need more scanlines evaluated to avoid
+    //  distortion.  For a pure Gaussian, the max_beam_sigma at which the
+    //  first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    float3 spot_shape;
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Power curve:
+        spot_shape = pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Spherical curve:
+        const float3 color_offset = color - float3(1.0);
+        spot_shape = sqrt(float3(1.0) - color_offset*color_offset);
+    }
+    return float3(beam_min_sigma) + sigma_range * spot_shape;
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Purpose:    Map a scanline color to a generalized Gaussian beta.
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  holding the generalized Gaussian beta parameters
+    //                  wanted for dim and bright colors respectively.
+    //              2.) beam_max_shape >= 2.0
+    //              3.) beam_min_shape is in [2.0, beam_max_shape]
+    //              4.) beam_shape_power is defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline.
+    //              2.) shape_range = beam_max_shape - beam_min_shape, passed
+    //                  in to avoid recomputing it when beam_{min, max}_shape
+    //                  are runtime shader parameters.
+    //  Returns:    The type-I generalized Gaussian "shape" parameter beta.
+    //  Effect of beta on the scanline distribution:
+    //  a.) beta < 2.0 narrows the peak into a spike with a discontinuous slope
+    //  b.) beta == 2.0 degenerates to a plain Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then falls off more steeply
+    //      than a Gaussian.  High sigmas widen and soften peaks; high betas
+    //      widen and sharpen them at the risk of aliasing.
+    //  In contrast to beam_spot_power, a high beam_shape_power softens shape
+    //  transitions and a low one sharpens them (at the risk of aliasing).
+    const float3 shape_curve = pow(color, float3(beam_shape_power));
+    return beam_min_shape + shape_range * shape_curve;
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Purpose:    Integrate one scanline's Gaussian beam over an output pixel.
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from the scanline, measured in scanlines: 1.0
+    //                  means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) The requirements of get_gaussian_sigma() are met.
+    //  Returns:    The scanline's light output over the given pixel.
+    //  Details:
+    //  A CRT beam profile is roughly Gaussian and wider for bright colors than
+    //  for dark ones.  A Gaussian always integrates to 1.0 over its full
+    //  range, so the standard deviation can vary without changing brightness.
+    //  With 'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  erf() numerically approximates the error function (the Gaussian
+    //  indefinite integral); differencing it at the pixel's two edges gives the
+    //  scanline's average brightness over the pixel.  Even with curved coords,
+    //  a flat scalar pixel height works almost as well as a full matrix.
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 erf_scale = 1.0/(beam_sigma*sqrt(2.0));
+    const float3 far_edge = (dist + half_pixel)*erf_scale;
+    const float3 near_edge = (dist - half_pixel)*erf_scale;
+    return color * 0.5*(erf(far_edge) - erf(near_edge))/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Purpose:    Integrate a scanline's generalized Gaussian beam over a pixel.
+    //  Requires:   The requirements of scanline_gaussian_integral_contrib(),
+    //              get_gaussian_sigma(), and get_generalized_gaussian_beta()
+    //              are all met.
+    //  Returns:    The scanline's light output over the given pixel.
+    //  A generalized Gaussian lets the shape (beta) vary along with the width
+    //  (alpha).  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  With "gamma" meaning the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function; only two of its
+    //  four branches are implemented (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  Alpha is derived from the intended Gaussian sigma, but it only strictly
+    //  models standard deviation at beta == 2, since that depends on both alpha
+    //  and beta (an independent alpha is faster and keeps results intuitive).
+    const float3 width_alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 shape_beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 inv_alpha = float3(1.0)/width_alpha;
+    const float3 inv_beta = float3(1.0)/shape_beta;
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    //  gamma_impl also takes beta so it needn't divide again; likewise
+    //  normalized_ligamma_impl takes beta (i.e. 1/s) and 1/gamma(s).
+    const float3 inv_gamma_s = float3(1.0)/gamma_impl(inv_beta, shape_beta);
+    const float3 far_edge = dist + half_pixel;
+    const float3 near_edge = dist - half_pixel;
+    const float3 z_far = pow(abs(far_edge)*inv_alpha, shape_beta);
+    const float3 z_near = pow(abs(near_edge)*inv_alpha, shape_beta);
+    const float3 cdf_far = sign(far_edge) * normalized_ligamma_impl(
+        inv_beta, z_far, shape_beta, inv_gamma_s);
+    const float3 cdf_near = sign(near_edge) * normalized_ligamma_impl(
+        inv_beta, z_near, shape_beta, inv_gamma_s);
+    return color * 0.5*(cdf_far - cdf_near)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  Point-sampled counterpart of scanline_gaussian_integral_contrib(); see
+    //  that function for detailed comments.  pixel_height is only read when
+    //  antialiasing is enabled.
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    //  Precompute reciprocals to avoid repeated divides:
+    const float3 inv_sigma = float3(1.0)/beam_sigma;
+    const float3 exp_scale = 0.5 * inv_sigma * inv_sigma;
+    const float3 norm_scale = inv_sigma/sqrt(2.0*pi);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Average three pure Gaussian samples: one at dist and one 1/3
+        //  pixel away in each direction.
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_pixel;
+        const float3 dist_minus = abs(dist - third_pixel);
+        const float3 weight_mid = exp(-(dist*dist)*exp_scale);
+        const float3 weight_plus = exp(-(dist_plus*dist_plus)*exp_scale);
+        const float3 weight_minus = exp(-(dist_minus*dist_minus)*exp_scale);
+        const float3 scale = color/3.0 * norm_scale;
+        return scale * (weight_mid + weight_plus + weight_minus);
+    }
+    //  No antialiasing: a single sample at dist.
+    return color*exp(-(dist*dist)*exp_scale)*norm_scale;
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Point-sampled generalized Gaussian beam (1 or 3 taps); details are in
+    //  scanline_generalized_gaussian_integral_contrib().  Each tap evaluates
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 width = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 shape = get_generalized_gaussian_beta(color, shape_range);
+    //  Take reciprocals once up front rather than dividing per tap:
+    const float3 rcp_width = float3(1.0)/width;
+    const float3 rcp_shape = float3(1.0)/shape;
+    const float3 color_norm = color * shape * 0.5 * rcp_width /
+        gamma_impl(rcp_shape, shape);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Add taps 1/3 pixel nearer to and farther from the scanline.
+        const float3 tap_step = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + tap_step;
+        const float3 dist_minus = abs(dist - tap_step);
+        //  Mean of the three generalized Gaussian taps:
+        const float3 tap_center = exp(-pow(abs(dist*rcp_width), shape));
+        const float3 tap_plus = exp(-pow(abs(dist_plus*rcp_width), shape));
+        const float3 tap_minus = exp(-pow(abs(dist_minus*rcp_width), shape));
+        return color_norm/3.0 * (tap_center + tap_plus + tap_minus);
+    }
+    else
+    {
+        return color_norm * exp(-pow(abs(dist*rcp_width), shape));
+    }
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Requires:   1.) The preconditions of the four helpers dispatched to
+    //                  below (e.g. scanline_gaussian_integral_contrib()).
+    //              2.) The preconditions of get_gaussian_sigma().
+    //              3.) The preconditions of get_generalized_gaussian_beta(),
+    //                  which the generalized helpers call.
+    //  Returns:    One scanline's light output over a given pixel.  Integrals
+    //              are used when beam_antialias_level > 1.5 (else samples), and
+    //              beam_generalized_gaussian picks the distribution.
+    if(beam_antialias_level > 1.5)
+    {
+        if(beam_generalized_gaussian)
+        {
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {
+            return scanline_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+    else
+    {
+        if(beam_generalized_gaussian)
+        {
+            return scanline_generalized_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {
+            return scanline_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    const float3 mixed = mul(weights, float4x3(color0, color1, color2, color3));
+    return max(mixed, 0.0);  //  Floor at 0: negative colors cause artifacts.
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  The weight is static or dynamic branches are allowed, so branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are colors in gamma-encoded RGB.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(1.0/intermediate_gamma)),
+                pow(color1, float3(1.0/intermediate_gamma)),
+                pow(color2, float3(1.0/intermediate_gamma)),
+                pow(color3, float3(1.0/intermediate_gamma)),
+                weights);
+            //  NOTE(review): the port left a disabled hard-coded weight override
+            //  (1.0) here marked "fixme"; its purpose is unclear - confirm.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv is snapped vertically to the line/scanline
+    //                  the caller wants and horizontally to the texel just
+    //                  left of the output pixel (the second of four taps).
+    //              2.) uv_step_x is the horizontal uv distance from one texel
+    //                  to the next.
+    //              3.) weights holds the filter weights for the four taps in
+    //                  left-to-right order, so weights.y belongs to the texel
+    //                  just left of the output pixel.
+    //  Returns:    A horizontally interpolated lookup built from 2-4 nearby
+    //              texels, blended by get_interpolated_linear_color().
+    //  The two outer taps are only fetched when beam_horiz_filter > 0.5; the
+    //  Quilez filter (which zeroes their weights) can skip those lookups.
+    const float3 left_inner = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 right_inner = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    float3 left_outer = float3(0.0);
+    float3 right_outer = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        left_outer = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        right_outer = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    //  Texels go in as sampled; the callee handles linear vs. gamma encoding.
+    return get_interpolated_linear_color(
+        left_outer, left_inner, right_inner, right_outer, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Filters one scanline horizontally at tex_uv using beam_horiz_filter.
+    //  First find the texel just left of tex_uv and the distances to 4 taps:
+    const float2 texel_pos = tex_uv * tex_size;
+    //  under_half works around a rounding bug at exact texel locations.
+    const float2 left_texel =
+        floor(texel_pos - float2(under_half)) + float2(0.5);
+    const float2 left_texel_same_row = float2(left_texel.x, texel_pos.y);
+    const float2 left_texel_uv = left_texel_same_row * texture_size_inv;
+    const float left_dist = texel_pos.x - left_texel_same_row.x;
+    const float4 tap_dists = float4(1.0 + left_dist, left_dist,
+        1.0 - left_dist, 2.0 - left_dist);
+    //  Pick resize weights (Quilez, Gaussian, or Lanczos2) for the 2/4 taps:
+    float4 tap_weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez (quintic curve between the two inner taps):
+        const float frac_x = tap_dists.y;
+        const float right_weight =
+            frac_x*frac_x*frac_x*(frac_x*(frac_x*6.0 - 15.0) + 10.0);
+        tap_weights = float4(0.0, 1.0 - right_weight, right_weight, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian using beam_horiz_sigma:
+        float exp_scale = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        tap_weights = exp(-(tap_dists*tap_dists)*exp_scale);
+    }
+    else
+    {
+        const float4 pi_d = FIX_ZERO(tap_dists * pi);  //  Lanczos2
+        tap_weights = 2.0 * sin(pi_d) * sin(pi_d * 0.5) /
+            (pi_d * pi_d);
+    }
+    //  Normalize so the weights sum to 1.0:
+    const float4 unit_weights = tap_weights/dot(tap_weights, float4(1.0));
+    //  Blend the taps along the scanline:
+    const float2 texel_step_u = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        tex, left_texel_uv, texel_step_u, unit_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Samples a scanline via sample_single_scanline_horizontal(); when
+    //  beam_misconvergence is set, R/G/B are each fetched at a shifted u.
+    if(beam_misconvergence)
+    {
+        const float3 offsets_rgb =
+            get_convergence_offsets_x_vector();
+        const float3 u_shift_rgb =
+            offsets_rgb * texture_size_inv.xxx;
+        const float2 uv_red = tex_uv - float2(u_shift_rgb.r, 0.0);
+        const float2 uv_green = tex_uv - float2(u_shift_rgb.g, 0.0);
+        const float2 uv_blue = tex_uv - float2(u_shift_rgb.b, 0.0);
+        const float3 red_lookup = sample_single_scanline_horizontal(
+            tex, uv_red, tex_size, texture_size_inv);
+        const float3 green_lookup = sample_single_scanline_horizontal(
+            tex, uv_green, tex_size, texture_size_inv);
+        const float3 blue_lookup = sample_single_scanline_horizontal(
+            tex, uv_blue, tex_size, texture_size_inv);
+        return float3(red_lookup.r, green_lookup.g, blue_lookup.b);
+    }
+    else
+    {
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Find texture coords for the last/upper scanline, honoring interlacing:
+    //  When interlaced, only even or only odd scanlines count on any given
+    //  frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames; BFF order puts them on odd frames.  Texel centers satisfy:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If the coordinates look wrong, first rule out anisotropic
+    //  filtering blurring across field boundaries.
+    //  Note: TFF/BFF is irrelevant for sources that double-weave or similar.
+    //  NOTE(review): the original port left a disabled hard-coded override
+    //  (interlace_bff1 = 1.0) here marked "fixme"; its purpose is unclear.
+    const float field_shift = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 texel_pos = tex_uv * tex_size;
+    //  under_half works around a rounding bug at exact texel locations.
+    const float2 prev_index = floor(texel_pos - float2(under_half));
+    const float field_excess = fmod(
+        prev_index.y + field_shift, il_step_multiple.y);
+    const float2 field_index = prev_index - float2(0.0, field_excess);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 field_texel_center = field_index + float2(0.5);
+    const float2 field_scanline_uv = field_texel_center * texture_size_inv;
+    //  Output the sample's distance from that scanline, in scanline units:
+    dist = (texel_pos.y - field_texel_center.y)/il_step_multiple.y;
+    return field_scanline_uv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Guess from the source's line count whether the content is interlaced.
+    //  The result stays false unless interlace_detect is enabled, and 1080
+    //  lines only count as interlaced when interlace_1080i is set.
+    bool interlaced = false;
+    if(interlace_detect)
+    {
+        //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+        //  NTSC Emulators: Typically 224 or 240 lines
+        //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+        //  PAL Emulators: ?
+        //  ATSC: 720p, 1080i, 1080p
+        //  Where do the cutoffs go?  Assumptions:
+        //  1.) Only active lines matter.
+        //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+        //  3.) Anything > 576 lines is probably not interlaced...
+        //  4.) ...except 1080 lines, which is a crapshoot (user decision).
+        //  5.) The float thresholds are nudged by .5 in case the main
+        //      program uses calculated video sizes.
+        const bool sd_range = (num_lines > 288.5) && (num_lines < 576.5);
+        const bool hd_1080 = (num_lines > 1079.5) && (num_lines < 1080.5);
+        //  1080-line sources only count when the user opts in:
+        const bool hd_opt_in = bool(interlace_1080i) && hd_1080;
+        interlaced = sd_range || hd_opt_in;
+    }
+    return interlaced;
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what's allowed:
+//  Dynamic loops not allowed: Use a flat static loop (USE_SINGLE_STATIC_LOOP).
+//  Dynamic loops accommodated: Coarsely branch around static loops
+//  (BREAK_LOOPS_INTO_PIECES).  Otherwise assume a flat dynamic loop works.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  Here the expected minimum tile size is simply the minimum allowed size:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size = 
+        mask_min_allowed_tile_size;
+//  Per-lobe pi scale, then the sinc tap limit at the maximum minification:
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   These global constants must be defined:
+    //              1.) mask_sinc_lobes
+    //              2.) max_sinc_resize_samples_m4
+    //  Returns:    The fewest texture samples (rounded up to a multiple of 4)
+    //              that downsize correctly at magnification_scale, capped by
+    //              the static maximum.
+    //  When downsizing, the filter spans 2*lobes output pixels rather than
+    //  2*lobes input texels, hence the division by magnification_scale.
+    const float needed = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float needed_m4 = ceil(needed * 0.25) * 4.0;
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float cap_m4 = max_sinc_resize_samples_m4;
+    #else   // ifdef BREAK_LOOPS_INTO_PIECES
+        //  Branch-simulated loops cannot exceed 128 samples.
+        const float cap_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    return min(needed_m4, cap_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
+    const float2 tex_size, const float dr,
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (matching the direction resampled this pass).
+    //                  A scalar is used to save register space.
+    //              2.) input_tiles_per_texture_r is how many input tiles fit
+    //                  in the input texture along the direction resampled
+    //                  this pass.
+    //              3.) vertical is true for a vertical resampling pass and
+    //                  false for a horizontal one.
+    //  Returns:    float2(tile_uv coord of the first sample in [0, 1], texel
+    //              distance from the destination pixel to that sample), both
+    //              taken along the resized dimension only.
+    //  Sampling starts at the topmost/leftmost tap and proceeds down/right.
+    //  Both dimensions are computed as if for a one-pass 2D resize; the
+    //  dimension not being resized is incorrect and is discarded by the
+    //  final select.
+    const float2 texel_pos = tex_uv * tex_size;
+    const float2 nearest_prev =
+        floor(texel_pos - float2(under_half)) + float2(0.5);
+    const float2 start_texel = nearest_prev - float2(samples/2.0 - 1.0);
+    const float2 start_uv_unwrapped = start_texel * dr;
+    const float2 start_dist = texel_pos - start_texel;
+    //  Rescale tex_uv to tile_uv so a frac can stand in for an fmod.
+    const float2 start_tile_uv_unwrapped =
+        start_uv_unwrapped * input_tiles_per_texture_r;
+    //  Wrap into the [0, 1] range.  Every sample gets wrapped, but only the
+    //  first texel can be negative, so it gets this extra adjustment.
+    const float2 below_zero = float2(
+        (start_tile_uv_unwrapped.x < 0.),(start_tile_uv_unwrapped.y < 0.));
+    const float2 start_tile_uv =
+        frac(start_tile_uv_unwrapped) + below_zero;
+    //  Pair each dimension's tile_uv coord with its texel distance:
+    const float2 horiz_pair =
+        float2(start_tile_uv.x, start_dist.x);
+    const float2 vert_pair =
+        float2(start_tile_uv.y, start_dist.y);
+    return vertical ? vert_pair : horiz_pair;
+    //return lerp(horiz_pair, vert_pair, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    //  One [slow] workaround is to force mip level 0 (or bias toward it):
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord in [0, 1].  tile_uv_r holds the wrapped tile_uv coord of
+    //  four new texel samples; tex_uv_r scales it by tile_size_uv_r for lookups.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the uv-space size of an input tile
+    //                  within the input texture (1.0/tile_size_uv_r tiles fit),
+    //                  along the direction we're resampling this pass.
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above (presumably they use these locals by name).
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Remainder: one more 4-sample block (64+32+16+8+4+4 = 128 max).
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize by the total weight (all 4 weight_sum lanes), and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Pass it on to get_first_texel_tile_uv_and_dist() (last argument).
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above (presumably they use these locals by name).
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Remainder: one more 4-sample block (64+32+16+8+4+4 = 128 max).
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize by the total weight (all 4 weight_sum lanes), and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
+    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    //  wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Requires:   1.) Requirements of get_resized_mask_tile_size() must be
+    //                  met, particularly regarding global constants.
+    //              The function parameters must be defined as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    //  Returns:    Return a float4 containing:
+    //                  xy: tex_uv coords for the start of the mask tile
+    //                  zw: tex_uv size of the mask tile from start to end
+    //              mask_tiles_per_screen is an out parameter containing the
+    //              number of mask tiles that will fit on the screen.
+    //  First get the final resized tile size.  The viewport size and mask
+    //  resize viewport scale must be correct, but don't solemnly swear they
+    //  were correct in both mask resize passes unless you know it's true.
+    //  (We can better ensure a correct tile aspect ratio if the parameters are
+    //  guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    //  sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);  //  unused if mode > 1.5
+    if(mask_sample_mode < 0.5)  //  mode 0
+    {
+        //  Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        //  size and starts at a nonzero offset to allow for border texels:
+        const float2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
+        const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        //  mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        //  If we're tiling at the original size (1:1 pixel:texel), redefine a
+        //  "tile" to be the full texture containing many triads.  Otherwise,
+        //  we're hardware-resampling an LUT, and the texture truly contains a
+        //  single unresized phosphor mask tile anyway.
+        static const float2 mask_tile_uv_size = float2(1.0);
+        static const float2 mask_tile_start_uv = float2(0.0);
+        if(mask_sample_mode > 1.5)  //  mode 2
+        {
+            //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else  //  mode 1
+        {
+            //  Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Requires:   1.) tile_uv_wrap contains tile-relative uv coords, where the
+    //                  tile spans from [0, 1], such that (0.5, 0.5) is at the
+    //                  tile center.  The input coords can range over [0, inf),
+    //                  and their fractional parts map to a repeated tile.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
+    //                  for the start of the embedded tile in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw contains the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    Return tex_uv coords (used for texture sampling)
+    //              corresponding to tile_uv_wrap.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        //  Manually repeat the resized mask tile to fill the screen:
+        //  First get fractional tile_uv coords.  Using frac/fmod on coords
+        //  confuses anisotropic filtering; fix it as user options dictate.
+        //  derived-settings-and-constants.h disables incompatible options.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+        #else
+            float2 tile_uv = frac(tile_uv_wrap);
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            const float2 tile_uv_dx = ddx(tile_uv);
+            const float2 tile_uv_dy = ddy(tile_uv);
+            tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
+                tile_uv_dx, tile_uv_dy);  //  NOTE: helper is disabled above
+        #endif
+        //  The tile is embedded in a padded FBO, and it may start at a
+        //  nonzero offset if border texels are used to avoid artifacts:
+        const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+            tile_uv * mask_tile_start_uv_and_size.zw;
+        return mask_tex_uv;
+    }
+    else
+    {
+        //  Sample from the input phosphor mask texture with hardware tiling.
+        //  If we're tiling at the original size (mode 2), the "tile" is the
+        //  whole texture, and it contains a large number of triads mapped with
+        //  a 1:1 pixel:texel ratio.  OTHERWISE, the texture contains a single
+        //  unresized tile.  tile_uv_wrap already has correct coords for both!
+        return tile_uv_wrap;
+    }
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+// already got it
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 tex2Dtiled_mask_linearize(const sampler2D tex,
+    const float2 tex_uv)
+{
+    //  If we're manually tiling a texture, anisotropic filtering can get
+    //  confused.  One workaround is to force or bias toward mip level 0:
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+            //  TODO: Use tex2Dlod_linearize with a calculated mip level.
+            return tex2Dlod_linearize(tex, float4(tex_uv, 0.0, 0.0));
+        #else
+            #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+                return tex2Dbias_linearize(tex, float4(tex_uv, 0.0, -16.0));
+            #else
+                return tex2D_linearize(tex, tex_uv);
+            #endif
+        #endif
+    #else
+        return tex2D_linearize(tex, tex_uv);
+    #endif
+}
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+// END VERTEX INCLUDES //
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+//#include "bloom-functions.h"
+
+////////////////////////////  BEGIN BLOOM-FUNCTIONS  ///////////////////////////
+
+#ifndef BLOOM_FUNCTIONS_H
+#define BLOOM_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These utility functions and constants help several passes determine the
+//  size and center texel weight of the phosphor bloom in a uniform manner.
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  We need to calculate the correct blur sigma using some .cgp constants:
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try 1.0 if this looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (try 1.0 if this looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian min/max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;      //  range [0, 2]
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5] (NOTE(review): the default x below is -1/3 -- confirm whether the range means magnitude)
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. re-add horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  !PHOSPHOR_MASK_GRILLE14: 15-pixel stripes (the default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero (yields |c| clamped to >= 2^-16, so the sign of c is dropped); using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static setting as-is
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  !DRIVERS_ALLOW_DYNAMIC_BRANCHES: only if the user forces it
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;    //  NOTE(review): signed .x; negative with the default (-1/3) offset -- abs() intended?
+#else   //  !GEOMETRY_EARLY
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else   //  !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else   //  !GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;     //  pi truncated to 12 decimal places
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/blur-functions.h"
+
+////////////////////////////  BEGIN BLUR-FUNCTIONS  ///////////////////////////
+
+#ifndef BLUR_FUNCTIONS_H
+#define BLUR_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides reusable one-pass and separable (two-pass) blurs.
+//  Requires:   All blurs share these requirements (dxdy requirement is split):
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) filter_linearN must == "true" in your .cgp preset unless
+//                  you're using tex2DblurNresize at 1x scale.
+//              3.) mipmap_inputN must == "true" in your .cgp preset if
+//                  output_size < video_size.
+//              4.) output_size == video_size / pow(2, M), where M is some
+//                  positive integer.  tex2Dblur*resize can resize arbitrarily
+//                  (and the blur will be done after resizing), but arbitrary
+//                  resizes "fail" with other blurs due to the way they mix
+//                  static weights with bilinear sample exploitation.
+//              5.) In general, dxdy should contain the uv pixel spacing:
+//                      dxdy = (video_size/output_size)/texture_size
+//              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
+//                  zero out the dxdy component in the unblurred dimension:
+//                      dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
+//              Many blurs share these requirements:
+//              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
+//                  or they will blur more in the lower-scaled dimension.
+//              2.) One-pass shared sample blurs require ddx(), ddy(), and
+//                  tex2Dlod() to be supported by the current Cg profile, and
+//                  the drivers must support high-quality derivatives.
+//              3.) One-pass shared sample blurs require:
+//                      tex_uv.w == log2(video_size/output_size).y;
+//              Non-wrapper blurs share this requirement:
+//              1.) sigma is the intended standard deviation of the blur
+//              Wrapper blurs share this requirement, which is automatically
+//              met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
+//              1.) blurN_std_dev must be global static const float values
+//                  specifying standard deviations for Nx blurs in units
+//                  of destination pixels
+//  Optional:   1.) The including file (or an earlier included file) may
+//                  optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
+//                  default standard deviations with those matching a binomial
+//                  distribution.  (See below for details/properties.)
+//              2.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_BLUR_STD_DEVS and override:
+//                      static const float blur3_std_dev
+//                      static const float blur4_std_dev
+//                      static const float blur5_std_dev
+//                      static const float blur6_std_dev
+//                      static const float blur7_std_dev
+//                      static const float blur8_std_dev
+//                      static const float blur9_std_dev
+//                      static const float blur10_std_dev
+//                      static const float blur11_std_dev
+//                      static const float blur12_std_dev
+//                      static const float blur17_std_dev
+//                      static const float blur25_std_dev
+//                      static const float blur31_std_dev
+//                      static const float blur43_std_dev
+//              3.) The including file (or an earlier included file) may
+//                  optionally #define OVERRIDE_ERROR_BLURRING and override:
+//                      static const float error_blurring
+//                  This tuning value helps mitigate weighting errors from one-
+//                  pass shared-sample blurs sharing bilinear samples between
+//                  fragments.  Values closer to 0.0 have "correct" blurriness
+//                  but allow more artifacts, and values closer to 1.0 blur away
+//                  artifacts by sampling closer to halfway between texels.
+//              UPDATE 6/21/14: The above static constants may now be overridden
+//              by non-static uniform constants.  This permits exposing blur
+//              standard deviations as runtime GUI shader parameters.  However,
+//              using them keeps weights from being statically computed, and the
+//              speed hit depends on the blur: On my machine, uniforms kill over
+//              53% of the framerate with tex2Dblur12x12shared, but they only
+//              drop the framerate by about 18% with tex2Dblur11fast.
+//  Quality and Performance Comparisons:
+//  For the purposes of the following discussion, "no sRGB" means
+//  GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
+//  1.) tex2DblurNfast is always faster than tex2DblurNresize.
+//  2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
+//      well, because they're the only ones that don't exploit bilinear samples.
+//      This also means they're the only functions which can be truly gamma-
+//      correct without linear (or sRGB FBO) input, but only at 1x scale.
+//  3.) One-pass shared sample blurs only have a speed advantage without sRGB.
+//      They also have some inaccuracies due to their shared-[bilinear-]sample
+//      design, which grow increasingly bothersome for smaller blurs and higher-
+//      frequency source images (relative to their resolution).  I had high
+//      hopes for them, but their most realistic use case is limited to quickly
+//      reblurring an already blurred input at full resolution.  Otherwise:
+//      a.) If you're blurring a low-resolution source, you want a better blur.
+//      b.) If you're blurring a lower mipmap, you want a better blur.
+//      c.) If you're blurring a high-resolution, high-frequency source, you
+//          want a better blur.
+//  4.) The one-pass blurs without shared samples grow slower for larger blurs,
+//      but they're competitive with separable blurs at 5x5 and smaller, and
+//      even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
+//  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
+//  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
+//  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
+//  same scale, so they just measure the cost of mipmapping each FBO (only every
+//  other FBO is mipmapped for separable blurs, to mimic realistic usage).
+//  Mipmap      Neither     sRGB+Mipmap sRGB        Function
+//  76.0        92.3        131.3       193.7       tex2Dblur3fast
+//  63.2        74.4        122.4       175.5       tex2Dblur3resize
+//  93.7        121.2       159.3       263.2       tex2Dblur3x3
+//  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
+//  63.2        74.4        122.4       175.5       tex2Dblur5fast
+//  49.3        54.8        100.0       132.7       tex2Dblur5resize
+//  59.7        68.7        115.4       162.1       tex2Dblur5x5
+//  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
+//  55.8        63.7        110.4       151.8       tex2Dblur7fast
+//  39.8        43.9        83.9        105.8       tex2Dblur7resize
+//  40.0        44.2        83.2        104.9       tex2Dblur7x7
+//  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
+//  49.3        55.1        99.9        132.5       tex2Dblur9fast
+//  33.3        36.2        72.4        88.0        tex2Dblur9resize
+//  27.8        29.7        61.3        72.2        tex2Dblur9x9
+//  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
+//  44.4        49.5        91.3        117.8       tex2Dblur11fast
+//  28.8        30.8        63.6        75.4        tex2Dblur11resize
+//  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
+//  TODO: Fill in benchmarks for new untested blurs.
+//                                                  tex2Dblur17fast
+//                                                  tex2Dblur25fast
+//                                                  tex2Dblur31fast
+//                                                  tex2Dblur43fast
+//                                                  tex2Dblur3x3resize
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  Set static standard deviations, but allow users to override them with their
+//  own constants (even non-static uniforms if they're okay with the speed hit):
+#ifndef OVERRIDE_BLUR_STD_DEVS
+    //  blurN_std_dev (the std. dev. for each Nx blur) is in dxdy-stride units.
+    #ifdef USE_BINOMIAL_BLUR_STD_DEVS
+        //  By request, we can define standard deviations corresponding to a
+        //  binomial distribution with p = 0.5 (related to Pascal's triangle).
+        //  This distribution works such that blurring multiple times should
+        //  have the same result as a single larger blur.  These values are
+        //  larger than default for blurs up to 6x and smaller thereafter.
+        static const float blur3_std_dev = 0.84931640625;
+        static const float blur4_std_dev = 0.84931640625;
+        static const float blur5_std_dev = 1.0595703125;
+        static const float blur6_std_dev = 1.06591796875;
+        static const float blur7_std_dev = 1.17041015625;
+        static const float blur8_std_dev = 1.1720703125;
+        static const float blur9_std_dev = 1.2259765625;
+        static const float blur10_std_dev = 1.21982421875;
+        static const float blur11_std_dev = 1.25361328125;
+        static const float blur12_std_dev = 1.2423828125;
+        static const float blur17_std_dev = 1.27783203125;
+        static const float blur25_std_dev = 1.2810546875;
+        static const float blur31_std_dev = 1.28125;
+        static const float blur43_std_dev = 1.28125;
+    #else   //  Default (non-binomial) standard deviations:
+        //  The defaults are the largest values that keep the largest unused
+        //  blur term on each side <= 1.0/256.0.  (We could get away with more
+        //  or be more conservative, but this compromise is pretty reasonable.)
+        static const float blur3_std_dev = 0.62666015625;
+        static const float blur4_std_dev = 0.66171875;
+        static const float blur5_std_dev = 0.9845703125;
+        static const float blur6_std_dev = 1.02626953125;
+        static const float blur7_std_dev = 1.36103515625;
+        static const float blur8_std_dev = 1.4080078125;
+        static const float blur9_std_dev = 1.7533203125;
+        static const float blur10_std_dev = 1.80478515625;
+        static const float blur11_std_dev = 2.15986328125;
+        static const float blur12_std_dev = 2.215234375;
+        static const float blur17_std_dev = 3.45535583496;
+        static const float blur25_std_dev = 5.3409576416;
+        static const float blur31_std_dev = 6.86488037109;
+        static const float blur43_std_dev = 10.1852050781;
+    #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
+#endif  //  OVERRIDE_BLUR_STD_DEVS
+
+#ifndef OVERRIDE_ERROR_BLURRING
+    //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
+    //  in shared-sample blurs but increase blurring and feature shifting.
+    static const float error_blurring = 0.5;    //  Default when not overridden
+#endif  //  OVERRIDE_ERROR_BLURRING
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//  gamma-management.h relies on pass-specific settings to guide its behavior:
+//  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
+//#include "gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA     //  Otherwise the includer defines these.
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  Low end of (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  true: encode/decode return alpha 1.0
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma accessors: built-in defaults unless OVERRIDE_DEVICE_GAMMA is set.
+#ifndef OVERRIDE_DEVICE_GAMMA
+    inline float get_crt_gamma() { return crt_reference_gamma_high; }
+    inline float get_gba_gamma() { return 3.5; }    //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma() { return lcd_office_gamma; }
+#else
+    //  Overridden: the user promises to globally define these constants.
+    inline float get_crt_gamma() { return crt_gamma; }
+    inline float get_gba_gamma() { return gba_gamma; }
+    inline float get_lcd_gamma() { return lcd_gamma; }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides.
+//  OVERRIDE_FINAL_GAMMA bypasses the SIMULATE_* macros entirely; without it,
+//  the first SIMULATE_* macro #defined (in the order tested below) decides the
+//  input/output gamma pair, and ntsc_gamma is used for both if none is set.
+#ifndef OVERRIDE_FINAL_GAMMA
+    //  If we gamma-correct every pass, ntsc_gamma is always used between
+    //  passes, so middle passes needn't care whether anything is simulated:
+    inline float get_intermediate_gamma() { return ntsc_gamma; }
+    //  Input gamma decodes the simulated source device; output gamma encodes
+    //  for the display it's being shown on:
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma() { return get_crt_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_lcd_gamma(); }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma() { return get_lcd_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma() { return get_gba_gamma(); }
+        inline float get_output_gamma() { return get_crt_gamma(); }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma() { return ntsc_gamma; }
+        inline float get_output_gamma() { return ntsc_gamma; }
+    #endif  //  SIMULATE_* selection
+#else
+    //  The user promises to globally define input_gamma, intermediate_gamma,
+    //  and output_gamma before this point:
+    inline float get_input_gamma() { return input_gamma; }
+    inline float get_intermediate_gamma() { return intermediate_gamma; }
+    inline float get_output_gamma() { return output_gamma; }
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  linearize_input and
+//  gamma_encode_output are plain static constants (they aren't derived), so
+//  the compiler can eliminate the dead branches that test them.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
+    #endif
+#else   //  Only the first pass decodes, and only the last pass encodes:
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma() { return get_input_gamma(); }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma() { return 1.0; }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma() { return get_output_gamma(); }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma() { return 1.0; }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct (true only when this pass doesn't linearize its input):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+//  Gamma-encode a linear color for this pass's output.  Returns the color
+//  unchanged when gamma_encode_output is false; otherwise raises rgb to
+//  1.0/get_pass_output_gamma().  Alpha passes through unless
+//  assume_opaque_alpha is set, in which case it's forced to 1.0.
+inline float4 encode_output(const float4 color)
+{
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    float3 encode_exponent = float3(1.0/get_pass_output_gamma());
+    float3 encoded_rgb = pow(color.rgb, encode_exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+//  Linearize a sample read from this pass's input.  Returns the color
+//  unchanged when linearize_input is false; otherwise raises rgb to
+//  get_pass_input_gamma().  Alpha passes through unless assume_opaque_alpha
+//  is set, in which case it's forced to 1.0.
+inline float4 decode_input(const float4 color)
+{
+    if(!linearize_input)
+    {
+        return color;
+    }
+    float3 decode_exponent = float3(get_pass_input_gamma());
+    float3 linear_rgb = pow(color.rgb, decode_exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//  Decode color with an explicit gamma (alpha forced to 1.0 if assume_opaque_alpha).
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    float3 linear_rgb = pow(color.rgb, gamma);
+    float alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        alpha = 1.0;
+    }
+    return float4(linear_rgb, alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: Sample tex and pass the result through decode_input().  tex_coords is deliberately not const (see the TODO/FIXME note above the wrappers).
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 variant: only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is passed as textureLod's LOD argument, not applied as a texel offset -- confirm this is intended.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  NOTE(review): as above (texel_off is used as the LOD); only tex_coords.xy is used for the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: NOTE(review): tex_coords.zw is ignored; this overload always samples LOD 0.0 -- confirm no caller relies on tex_coords.w.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): texel_off is passed as textureLod's LOD argument rather than applied as a texel offset -- confirm this is intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
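+/////////*
+//  NOTE(review): the marker above is not a block-comment terminator, so the
+//  comment opened before "tex3D:" stays open through the MANUALLY LINEARIZING
+//  section below, up to the terminator that follows the
+//  tex2Dfetch_linearize_gamma wrappers.  Everything in between is disabled.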
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+//  float2 overload: samples tex at tex_coords via COMPAT_TEXTURE() and decodes
+//  the result with the caller-supplied gamma through decode_gamma_input().
+//  NOTE(review): as written, this definition sits inside the block comment
+//  opened before "tex3D:" above (no terminator precedes it), so it appears to
+//  be compiled out -- confirm before calling it.
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+//  float3 overload: only tex_coords.xy is used for the COMPAT_TEXTURE() lookup;
+//  the sample is decoded with the caller-supplied gamma.
+//  NOTE(review): as written, this definition sits inside the block comment
+//  opened before "tex3D:" above (no terminator precedes it), so it appears to
+//  be compiled out -- confirm before calling it.
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{
+    //  Returns:    The sample of tex at tex_coords.xy, fetched through
+    //              textureLod() with an explicit LOD of 0.0 and decoded with
+    //              the caller-supplied gamma via decode_gamma_input().
+    //  Note:       tex_coords.zw are not read by this wrapper.
+    float4 encoded_sample = textureLod(tex, tex_coords.xy, 0.0);
+    return decode_gamma_input(encoded_sample, gamma);
+}
+
+//  Offset overload of tex2Dlod_linearize_gamma(): samples tex at tex_coords.xy
+//  and decodes the result with the caller-supplied gamma.
+//  NOTE(review): texel_off is forwarded as the third argument of textureLod(),
+//  which is the LOD, not a texel offset, and tex_coords.zw are unused.  This
+//  looks unintended -- confirm against callers before relying on this overload.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+//#include "quad-pixel-communication.h"
+
+///////////////////////  BEGIN QUAD-PIXEL-COMMUNICATION  //////////////////////
+
+#ifndef QUAD_PIXEL_COMMUNICATION_H
+#define QUAD_PIXEL_COMMUNICATION_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey*
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DISCLAIMER  /////////////////////////////////
+
+//  *This code was inspired by "Shader Amortization using Pixel Quad Message
+//  Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2.  My intent
+//  is not to plagiarize his fundamentally similar code and assert my own
+//  copyright, but the algorithmic helper functions require so little code that
+//  implementations can't vary by much except bugfixes and conventions.  I just
+//  wanted to license my own particular code here to avoid ambiguity and make it
+//  clear that as far as I'm concerned, people can do as they please with it.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  Given screen pixel numbers, derive a "quad vector" describing a fragment's
+//  position in its 2x2 pixel quad.  Given that vector, obtain the values of any
+//  variable at neighboring fragments.
+//  Requires:   Using this file in general requires:
+//              1.) ddx() and ddy() are present in the current Cg profile.
+//              2.) The GPU driver is using fine/high-quality derivatives.
+//                  Functions will give incorrect results if this is not true,
+//                  so a test function is included.
+
+
+/////////////////////  QUAD-PIXEL COMMUNICATION PRIMITIVES  ////////////////////
+
+float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   output_pixel_num_wrt_uvxy carries two measures of this
+    //              fragment's output pixel number, each within
+    //              ([0, output_size.x), [0, output_size.y)):
+    //              1.) The .xy components increase with uv coords.
+    //              2.) The .zw components increase with screen xy.
+    //  Returns:    The fragment's placement inside its 2x2 quad, twice over:
+    //              1.) .xy is the placement along the uv direction, with the
+    //                  origin (0, 0) at the top-left:
+    //                  top-left     = (-1.0, -1.0) top-right    = ( 1.0, -1.0)
+    //                  bottom-left  = (-1.0,  1.0) bottom-right = ( 1.0,  1.0)
+    //                  Use it to arrange/weight shared texture samples.
+    //              2.) .zw is the placement along the screen xy direction
+    //                  (position); its origin varies.  quad_gather() depends
+    //                  on this measure.
+    //              Note: quad_vector.zw = quad_vector.xy * float2(
+    //                      ddx(output_pixel_num_wrt_uvxy.x),
+    //                      ddy(output_pixel_num_wrt_uvxy.y));
+    //  Caveats:    The GPU driver is assumed to always start 2x2 pixel quads
+    //              at even pixel numbers.  That can be false for odd output
+    //              resolutions (nondeterministically so).
+    //  Parity per component: 0.0 for even and 1.0 for odd whole pixel numbers.
+    float4 parity = 2.0 * frac(0.5 * output_pixel_num_wrt_uvxy);
+    //  Remap the parity from {0, 1} onto {-1, 1}.
+    return 2.0 * parity - float4(1.0);
+}
+
+float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
+{
+    //  Requires:   Same as get_quad_vector_naive() (see that first).
+    //  Returns:    Same as get_quad_vector_naive() (see that first), except the
+    //              result stays correct when the 2x2 pixel quad begins at an
+    //              odd pixel, which can happen at odd resolutions.
+    float4 naive_quad = get_quad_vector_naive(output_pixel_num_wrt_uvxy);
+    //  When the naive .zw guess fails to increase with screen xy, the quad
+    //  must start at an odd pixel; half the screen-space derivative gives the
+    //  per-axis factor that mirrors the guess in that case.
+    float mirror_x = 0.5 * ddx(naive_quad.z);
+    float mirror_y = 0.5 * ddy(naive_quad.w);
+    return naive_quad * float4(mirror_x, mirror_y, mirror_x, mirror_y);
+}
+
+float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
+{
+    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //              2.) output_pixel_num_wrt_uv increases with uv coords and
+    //                  measures the current fragment's output pixel number in:
+    //                      ([0, output_size.x), [0, output_size.y))
+    //  Returns:    Same as get_quad_vector_naive() (see that first), except the
+    //              result stays correct when the 2x2 pixel quad begins at an
+    //              odd pixel, which can happen at odd resolutions.
+    //  Caveats:    Needs less information than the float4 overload, but it is
+    //              potentially slower.
+    //  Placement guess along uv: parity remapped from {0, 1} onto {-1, 1}.
+    float2 parity_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
+    float2 uv_guess = (parity_uv - float2(0.5)) * 2.0;
+    //  Direction of screen xy relative to (uv.x, uv.y), each in {-1, 1}.
+    float2 uv_to_screen = float2(ddx(output_pixel_num_wrt_uv.x),
+                                 ddy(output_pixel_num_wrt_uv.y));
+    float2 screen_guess = uv_guess * uv_to_screen;
+    //  A screen-space guess that fails to increase with screen xy means the
+    //  2x2 pixel quad starts at an odd pixel; mirror the guess in that case.
+    float2 odd_start_mirror = 0.5 * float2(ddx(screen_guess.x),
+                                           ddy(screen_guess.y));
+    float4 combined_guess = float4(uv_guess, screen_guess);
+    return combined_guess * odd_start_mirror.xyxy;
+}
+
+void quad_gather(float4 quad_vector, float4 curr,
+    out float4 adjx, out float4 adjy, out float4 diag)
+{
+    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //              2.) The GPU driver is using fine/high-quality derivatives.
+    //              3.) quad_vector gives this fragment's location in its 2x2
+    //                  pixel quad, following get_quad_vector()'s conventions.
+    //              4.) curr is any vector whose neighboring values are wanted.
+    //  Returns:    Through the out parameters, the value of curr at the
+    //              fragments adjacent in x (adjx), adjacent in y (adjy), and
+    //              diagonal (diag).
+    float side_x = quad_vector.z;
+    float side_y = quad_vector.w;
+    adjx = curr - side_x * ddx(curr);
+    adjy = curr - side_y * ddy(curr);
+    //  The diagonal neighbor is the y-neighbor of the x-neighbor.
+    diag = adjx - side_y * ddy(adjx);
+}
+
+void quad_gather(float4 quad_vector, float3 curr,
+    out float3 adjx, out float3 adjy, out float3 diag)
+{
+    //  Float3 version: writes the values of curr at the x-adjacent,
+    //  y-adjacent, and diagonal fragments of the quad to the out parameters.
+    float side_x = quad_vector.z;
+    float side_y = quad_vector.w;
+    adjx = curr - side_x * ddx(curr);
+    adjy = curr - side_y * ddy(curr);
+    //  The diagonal neighbor is the y-neighbor of the x-neighbor.
+    diag = adjx - side_y * ddy(adjx);
+}
+
+void quad_gather(float4 quad_vector, float2 curr,
+    out float2 adjx, out float2 adjy, out float2 diag)
+{
+    //  Float2 version: writes the values of curr at the x-adjacent,
+    //  y-adjacent, and diagonal fragments of the quad to the out parameters.
+    float side_x = quad_vector.z;
+    float side_y = quad_vector.w;
+    adjx = curr - side_x * ddx(curr);
+    adjy = curr - side_y * ddy(curr);
+    //  The diagonal neighbor is the y-neighbor of the x-neighbor.
+    diag = adjx - side_y * ddy(adjx);
+}
+
+float4 quad_gather(float4 quad_vector, float curr)
+{
+    //  Float version: packs all four quad values into one vector.
+    //  Returns:    return.x == current
+    //              return.y == adjacent x
+    //              return.z == adjacent y
+    //              return.w == diagonal
+    float4 gathered = float4(curr);
+    gathered.y = curr - ddx(curr) * quad_vector.z;
+    //  One ddy() over (current, adjacent x) yields (adjacent y, diagonal).
+    gathered.zw = gathered.xy - ddy(gathered.xy) * quad_vector.w;
+    return gathered;
+}
+
+float4 quad_gather_sum(float4 quad_vector, float4 curr)
+{
+    //  Requires:   Same as quad_gather()
+    //  Returns:    The sum of curr over all four fragments of the quad.
+    float4 neighbor_x, neighbor_y, neighbor_diag;
+    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_diag);
+    float4 total = curr + neighbor_x;
+    total += neighbor_y;
+    total += neighbor_diag;
+    return total;
+}
+
+float3 quad_gather_sum(float4 quad_vector, float3 curr)
+{
+    //  Float3 version: sum of curr over all four fragments of the quad.
+    float3 neighbor_x, neighbor_y, neighbor_diag;
+    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_diag);
+    float3 total = curr + neighbor_x;
+    total += neighbor_y;
+    total += neighbor_diag;
+    return total;
+}
+
+float2 quad_gather_sum(float4 quad_vector, float2 curr)
+{
+    //  Float2 version: sum of curr over all four fragments of the quad.
+    float2 neighbor_x, neighbor_y, neighbor_diag;
+    quad_gather(quad_vector, curr, neighbor_x, neighbor_y, neighbor_diag);
+    float2 total = curr + neighbor_x;
+    total += neighbor_y;
+    total += neighbor_diag;
+    return total;
+}
+
+float quad_gather_sum(float4 quad_vector, float curr)
+{
+    //  Float version: sum of curr over all four fragments of the quad, using
+    //  the packed (current, adjacent x, adjacent y, diagonal) gather result.
+    float4 quad_values = quad_gather(quad_vector, curr);
+    float total = quad_values.x + quad_values.y;
+    total += quad_values.z;
+    total += quad_values.w;
+    return total;
+}
+
+bool fine_derivatives_working(float4 quad_vector, float4 curr)
+{
+    //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
+    //              2.) quad_vector gives this fragment's location in its 2x2
+    //                  pixel quad, following get_quad_vector()'s conventions.
+    //              3.) curr is a test vector with non-constant derivatives
+    //                  (its value should change nonlinearly across fragments).
+    //  Returns:    true if fine/hybrid/high-quality derivatives are used, or
+    //              false if coarse derivatives are used or inconclusive
+    //  Usage:      Test whether quad-pixel communication is working!
+    //  Method:     Fine derivatives are confirmed if the following holds
+    //              (ever, for any value at any fragment):
+    //                  (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
+    //              Testing more values (e.g. a float4 two ways) makes it
+    //              easier to demonstrate fine derivatives are working.
+    //  TODO: Check for floating point exact comparison issues!
+    float4 dx_here = ddx(curr);
+    float4 dy_here = ddy(curr);
+    //  Derivatives of the reconstructed x- and y-neighbors, taken once each.
+    float4 dy_at_adjx = ddy(curr - dx_here * quad_vector.z);
+    float4 dx_at_adjy = ddx(curr - dy_here * quad_vector.w);
+    bool ddy_different =
+        dy_here.x != dy_at_adjx.x || dy_here.y != dy_at_adjx.y ||
+        dy_here.z != dy_at_adjx.z || dy_here.w != dy_at_adjx.w;
+    bool ddx_different =
+        dx_here.x != dx_at_adjy.x || dx_here.y != dx_at_adjy.y ||
+        dx_here.z != dx_at_adjy.z || dx_here.w != dx_at_adjy.w;
+    return ddy_different || ddx_different;
+}
+
+bool fine_derivatives_working_fast(float4 quad_vector, float curr)
+{
+    //  Requires:   Same as fine_derivatives_working()
+    //  Returns:    Same as fine_derivatives_working()
+    //  Usage:      Cheaper than fine_derivatives_working() but more prone to
+    //              false negatives, so it is less useful for offline
+    //              testing/debugging.  It is also useless as the basis for
+    //              dynamic runtime branching as of May 2014: derivatives (and
+    //              quad-pixel communication) are currently disallowed in
+    //              branches.  Future GPU's may allow them in dynamic branches
+    //              if you promise the branch condition evaluates the same for
+    //              every fragment in the quad (and/or if the driver enforces
+    //              that promise by making a single fragment control branch
+    //              decisions).  If that ever happens, this version may become
+    //              a more economical choice.
+    //  Only the single test (ddy(curr) != ddy(adjx)) is performed here.
+    float dy_here = ddy(curr);
+    float neighbor_x = curr - ddx(curr) * quad_vector.z;
+    return dy_here != ddy(neighbor_x);
+}
+
+#endif  //  QUAD_PIXEL_COMMUNICATION_H
+
+////////////////////////  END QUAD-PIXEL-COMMUNICATION  ///////////////////////
+
+//#include "special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              Its max absolute error is 2.5*10**-5, with solid numerical
+    //              robustness and efficiency.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    static const float4 one = float4(1.0);
+    //  The polynomial is evaluated on |x|; the sign is restored at the end.
+    const float4 t = one/(one + 0.47047*abs(x));
+    const float4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float4 erf_abs = one - poly*exp(-(x*x));
+    return erf_abs * sign(x);
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 version of the Abramowitz/Stegun erf() approximation:
+    static const float3 one = float3(1.0);
+    //  The polynomial is evaluated on |x|; the sign is restored at the end.
+    const float3 t = one/(one + 0.47047*abs(x));
+    const float3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float3 erf_abs = one - poly*exp(-(x*x));
+    return erf_abs * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 version of the Abramowitz/Stegun erf() approximation:
+    static const float2 one = float2(1.0);
+    //  The polynomial is evaluated on |x|; the sign is restored at the end.
+    const float2 t = one/(one + 0.47047*abs(x));
+    const float2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float2 erf_abs = one - poly*exp(-(x*x));
+    return erf_abs * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Float version of the Abramowitz/Stegun erf() approximation:
+    //  The polynomial is evaluated on |x|; the sign is restored at the end.
+    const float t = 1.0/(1.0 + 0.47047*abs(x));
+    const float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+    const float erf_abs = 1.0 - poly*exp(-(x*x));
+    return erf_abs * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf() approximated with the hyperbolic tangent.  The error
+    //              is visually noticeable, but it's blazing fast and
+    //              perceptually close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly implement
+    //              tanh(): My nVidia 8800GTS returns garbage output.
+    const float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 version of the tanh()-based erf() approximation:
+    const float3 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 version of the tanh()-based erf() approximation:
+    const float2 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    //  Float version of the tanh()-based erf() approximation:
+    const float scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An approximation of erf(x) chosen by user settings:
+    //              erft() when ERF_FAST_APPROXIMATION is defined, erf6()
+    //              otherwise.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 version: erft() when ERF_FAST_APPROXIMATION is defined, erf6()
+    //  otherwise.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 version: erft() when ERF_FAST_APPROXIMATION is defined, erf6()
+    //  otherwise.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Float version: erft() when ERF_FAST_APPROXIMATION is defined, erf6()
+    //  otherwise.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller precomputes this value so
+    //                  that it can be reused elsewhere.
+    //  Returns:    Approximate gamma function (real-numbered factorial)
+    //              output from the Lanczos approximation with two coefficients
+    //              calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  Three coeffs (0.0000346 error) would not
+    //              hurt latency, but two allow more parallelism with outside
+    //              instructions.
+    static const float4 g = float4(1.12906830989);
+    static const float4 c0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 c1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 e = float4(2.71828182845904523536028747135266249775724709);
+    const float4 s_plus_half = s + float4(0.5);
+    const float4 series = c0 + c1/(s + float4(1.0));
+    const float4 power_base = (s_plus_half + g)/e;  //  (s + g + 0.5)/e
+    //  gamma(s + 1) = power_base**s_plus_half * series.  Dividing by s then
+    //  gives gamma(s), which has less error for small s's than doing
+    //  (s -= 1.0) at the beginning.
+    const float4 gamma_s_plus_one = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 version of the two-coefficient Lanczos approximation:
+    static const float3 g = float3(1.12906830989);
+    static const float3 c0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 c1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 e = float3(2.71828182845904523536028747135266249775724709);
+    const float3 s_plus_half = s + float3(0.5);
+    const float3 series = c0 + c1/(s + float3(1.0));
+    const float3 power_base = (s_plus_half + g)/e;
+    //  gamma(s + 1), then divide by s (via s_inv) for gamma(s).
+    const float3 gamma_s_plus_one = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 version of the two-coefficient Lanczos approximation:
+    static const float2 g = float2(1.12906830989);
+    static const float2 c0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 c1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 e = float2(2.71828182845904523536028747135266249775724709);
+    const float2 s_plus_half = s + float2(0.5);
+    const float2 series = c0 + c1/(s + float2(1.0));
+    const float2 power_base = (s_plus_half + g)/e;
+    //  gamma(s + 1), then divide by s (via s_inv) for gamma(s).
+    const float2 gamma_s_plus_one = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Float version of the two-coefficient Lanczos approximation:
+    static const float g = 1.12906830989;
+    static const float c0 = 0.8109119309638332633713423362694399653724431;
+    static const float c1 = 0.4808354605142681877121661197951496120000040;
+    static const float e = 2.71828182845904523536028747135266249775724709;
+    const float s_plus_half = s + 0.5;
+    const float series = c0 + c1/(s + 1.0);
+    const float power_base = (s_plus_half + g)/e;
+    //  gamma(s + 1), then divide by s (via s_inv) for gamma(s).
+    const float gamma_s_plus_one = pow(power_base, s_plus_half) * series;
+    return gamma_s_plus_one * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard parameter to the gamma function, and it
+    //              should lie in the [0, 36] range.
+    //  Returns:    Approximate gamma function output with a maximum relative
+    //              error of 0.000463.  See gamma_impl for details.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    //  Float3 version: forwards s and its reciprocal to gamma_impl().
+    const float3 s_inv = float3(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    //  Float2 version: forwards s and its reciprocal to gamma_impl().
+    const float2 s_inv = float2(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    //  Float version: forwards s and its reciprocal to gamma_impl().
+    const float s_inv = 1.0/s;
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The code below evaluates, with the constants folded in:
+    //      z**s * (1/s - z/(s + 1) + z**2/(2*(s + 2)) - z**3/(6*(s + 3)))
+    //  i.e. term k is (-z)**k / (k! * (s + k)) for k = 0..3, and term 0 is
+    //  supplied directly by s_inv.  The next term would be
+    //  z**4/(24.0*s + 96.0); it is left out.
+    const float4 z_sq = z*z;
+    const float4 z_cu = z * z_sq;
+    const float4 denom1 = s + float4(1.0);
+    const float4 denom2 = 2.0*s + float4(4.0);
+    const float4 denom3 = 6.0*s + float4(18.0);
+    const float4 series = s_inv - z/denom1 + z_sq/denom2 - z_cu/denom3;
+    //  Scale and return:
+    return pow(z, s) * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 version of the 4-term lower incomplete gamma series:
+    const float3 z_sq = z*z;
+    const float3 z_cu = z * z_sq;
+    const float3 denom1 = s + float3(1.0);
+    const float3 denom2 = 2.0*s + float3(4.0);
+    const float3 denom3 = 6.0*s + float3(18.0);
+    const float3 series = s_inv - z/denom1 + z_sq/denom2 - z_cu/denom3;
+    //  Scale and return:
+    return pow(z, s) * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 version of the 4-term lower incomplete gamma series:
+    const float2 z_sq = z*z;
+    const float2 z_cu = z * z_sq;
+    const float2 denom1 = s + float2(1.0);
+    const float2 denom2 = 2.0*s + float2(4.0);
+    const float2 denom3 = 6.0*s + float2(18.0);
+    const float2 series = s_inv - z/denom1 + z_sq/denom2 - z_cu/denom3;
+    //  Scale and return:
+    return pow(z, s) * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Float version of the 4-term lower incomplete gamma series:
+    const float z_sq = z*z;
+    const float z_cu = z * z_sq;
+    const float denom1 = s + 1.0;
+    const float denom2 = 2.0*s + 4.0;
+    const float denom3 = 6.0*s + 18.0;
+    const float series = s_inv - z/denom1 + z_sq/denom2 - z_cu/denom3;
+    //  Scale and return:
+    return pow(z, s) * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+    //  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up:"
+    //      denom = float4('inf');
+    //      float4 one = float4(1.0);
+    //      for(int i = 4; i > 0; --i)
+    //      {
+    //          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+    //      }
+    //  Unrolled with constants folded (i = 4 first, i = 1 last):
+    float4 cf_tail = float4(7.0) + z - s;
+    cf_tail = float4(5.0) + z - s + (3.0*s - float4(9.0))/cf_tail;
+    cf_tail = float4(3.0) + z - s + (2.0*s - float4(4.0))/cf_tail;
+    cf_tail = float4(1.0) + z - s + (s - float4(1.0))/cf_tail;
+    const float4 prefactor = pow(z, s) * exp(-z);
+    return prefactor / cf_tail;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+	//  Float3 overload of the 4-level continued fraction:
+	float3 cf_denom = float3(7.0) + z - s;
+	cf_denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/cf_denom;
+	cf_denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/cf_denom;
+	cf_denom = float3(1.0) + z - s + (s - float3(1.0))/cf_denom;
+	const float3 cf_numer = pow(z, s) * exp(-z);
+	return cf_numer / cf_denom;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+	//  Float2 overload of the 4-level continued fraction:
+	float2 cf_denom = float2(7.0) + z - s;
+	cf_denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/cf_denom;
+	cf_denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/cf_denom;
+	cf_denom = float2(1.0) + z - s + (s - float2(1.0))/cf_denom;
+	const float2 cf_numer = pow(z, s) * exp(-z);
+	return cf_numer / cf_denom;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+	//  Scalar overload of the 4-level continued fraction:
+	float cf_denom = 7.0 + z - s;
+	cf_denom = 5.0 + z - s + (3.0*s - 9.0)/cf_denom;
+	cf_denom = 3.0 + z - s + (2.0*s - 4.0)/cf_denom;
+	cf_denom = 1.0 + z - s + (s - 1.0)/cf_denom;
+	const float cf_numer = pow(z, s) * exp(-z);
+	return cf_numer / cf_denom;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+	//  Requires:   1.) s < ~0.5
+	//              2.) s_inv = 1/s (precomputed for outside reuse)
+	//              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+	//  Returns:    Approximate the normalized lower incomplete gamma function
+	//              for s < 0.5.  Only two branches (not four) are needed for
+	//              that range, selected per component by z.  Each branch uses
+	//              four terms, with a max relative error of ~0.00182.  The
+	//              branch threshold and specifics were adapted for fewer terms
+	//              from Gil/Segura/Temme's paper here:
+	//                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+	//  Both branches are always evaluated: Real branches tested slower.
+	const float4 from_upper = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float4 from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	static const float4 z_cutoff = float4(0.775075);
+	bool4 use_upper;
+	use_upper.x = z.x > z_cutoff.x;
+	use_upper.y = z.y > z_cutoff.y;
+	use_upper.z = z.z > z_cutoff.z;
+	use_upper.w = z.w > z_cutoff.w;
+	//  Blend with 0/1 masks (bool -> float) to select a branch per component:
+	bool4 use_lower = not(use_upper);
+	return from_upper * float4(use_upper) + from_lower * float4(use_lower);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+	//  Float3 overload: evaluate both branches, then select per component.
+	const float3 from_upper = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float3 from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	static const float3 z_cutoff = float3(0.775075);
+	bool3 use_upper;
+	use_upper.x = z.x > z_cutoff.x;
+	use_upper.y = z.y > z_cutoff.y;
+	use_upper.z = z.z > z_cutoff.z;
+	bool3 use_lower = not(use_upper);
+	return from_upper * float3(use_upper) + from_lower * float3(use_lower);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+	//  Float2 overload: evaluate both branches, then select per component.
+	const float2 from_upper = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float2 from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	static const float2 z_cutoff = float2(0.775075);
+	bool2 use_upper;
+	use_upper.x = z.x > z_cutoff.x;
+	use_upper.y = z.y > z_cutoff.y;
+	bool2 use_lower = not(use_upper);
+	return from_upper * float2(use_upper) + from_lower * float2(use_lower);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+	//  Scalar overload: evaluate both branches, then select with 0/1 masks.
+	const float from_upper = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	static const float z_cutoff = 0.775075;
+	const bool use_upper = z > z_cutoff;
+	return from_upper * float(use_upper) + from_lower * float(!use_upper);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+	//  Requires:   s < ~0.5
+	//  Returns:    The approximate normalized lower incomplete gamma function
+	//              of (s, z).  See normalized_ligamma_impl() for details.
+	const float4 recip_s = float4(1.0)/s;
+	const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+	//  Float3 overload: precompute 1/s and 1/gamma(s), then delegate.
+	const float3 recip_s = float3(1.0)/s;
+	const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+	//  Float2 overload: precompute 1/s and 1/gamma(s), then delegate.
+	const float2 recip_s = float2(1.0)/s;
+	const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+	//  Scalar overload: precompute 1/s and 1/gamma(s), then delegate.
+	const float recip_s = 1.0/s;
+	const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 uv2_to_uv4(float2 tex_uv)
+{
+    //  Zero-pad a float2 uv offset so it adds safely to float4 tex2Dlod coords:
+    return float4(tex_uv.x, tex_uv.y, 0.0, 0.0);
+}
+
+//  Squared-length helper macro (a macro so static constants can use it):
+#define LENGTH_SQ(v) (dot(v, v))
+
+inline float get_fast_gaussian_weight_sum_inv(const float sigma)
+{
+    //  Returns the approximate inverse of the unnormalized Gaussian weight sum.
+    //  The unnormalized center weight is 1.0, so the normalized center weight
+    //  equals the weight sum inverse.  For a large enough blur (9+), the
+    //  Gaussian integral gives a close asymptotic value for it:
+    //      center_weight = 0.5 *
+    //          (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
+    //      erf(-x) == -erf(x), so this is erf(0.5/(sigma*sqrt(2.0))):
+    //static const float temp = 0.5/sqrt(2.0);
+    //return erf(temp/sigma);
+    //  Curve-fitting is faster still (relative FPS: 134.3 with erf, 135.8 with
+    //  the fit) and closer than the asymptote: It was built from 64 blur sizes
+    //  in [3, 131) and 255 equally-spaced sigmas in (0, blurN_std_dev), biasing
+    //  smaller sigmas toward smaller blurs.  The max error is 0.0031793913.
+    //  NOTE(review): both min() operands are >= 1.0 for sigma < ~0.399.
+    const float fitted = exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
+    const float asymptote = 0.399334576340352/sigma;
+    return min(fitted, asymptote);
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
+
+float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup using an 11-tap
+    //              blur.  It may be mipmapped depending on settings and dxdy.
+    //  Compute unnormalized Gaussian weights exp(-d*d/(2*sigma*sigma)) for tap
+    //  distances d of 0-5; constant factors cancel during normalization.
+    const float half_inv_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * half_inv_var);
+    const float g2 = exp(-4.0 * half_inv_var);
+    const float g3 = exp(-9.0 * half_inv_var);
+    const float g4 = exp(-16.0 * half_inv_var);
+    const float g5 = exp(-25.0 * half_inv_var);
+    const float norm_factor = 1.0 /
+        (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5));
+    //  Accumulate the weighted taps in order from -5 to +5 steps of dxdy,
+    //  then normalize once at the end (the weights depend on sigma).
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += g5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
+    accum += g4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    accum += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    accum += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    accum += g4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    accum += g5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
+    return accum * norm_factor;
+}
+
+float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup using a 9-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-4 and their inverse sum:
+    const float half_inv_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * half_inv_var);
+    const float g2 = exp(-4.0 * half_inv_var);
+    const float g3 = exp(-9.0 * half_inv_var);
+    const float g4 = exp(-16.0 * half_inv_var);
+    const float norm_factor = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Accumulate the weighted taps in order along dxdy, then normalize:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += g4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
+    accum += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    accum += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    accum += g4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
+    return accum * norm_factor;
+}
+
+float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup using a 7-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-3 and their inverse sum:
+    const float half_inv_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * half_inv_var);
+    const float g2 = exp(-4.0 * half_inv_var);
+    const float g3 = exp(-9.0 * half_inv_var);
+    const float norm_factor = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Accumulate the weighted taps in order along dxdy, then normalize:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += g3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    accum += g3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
+    return accum * norm_factor;
+}
+
+float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup using a 5-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-2 and their inverse sum:
+    const float half_inv_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * half_inv_var);
+    const float g2 = exp(-4.0 * half_inv_var);
+    const float norm_factor = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Accumulate the weighted taps in order along dxdy, then normalize:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += g2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    accum += g2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
+    return accum * norm_factor;
+}
+
+float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using a 3-tap blur.
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized Gaussian weights for tap distances 0-1 and their inverse sum:
+    const float half_inv_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * half_inv_var);
+    const float norm_factor = 1.0 / (g0 + 2.0 * g1);
+    //  Accumulate the weighted taps in order along dxdy, then normalize:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += g1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += g1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
+    return accum * norm_factor;
+}
+
+
+///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
+
+float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   1.) Global requirements must be met (see file description).
+    //              2.) filter_linearN must = "true" in your .cgp file.
+    //              3.) For gamma-correct bilinear filtering, global
+    //                  gamma_aware_bilinear == true (from gamma-management.h)
+    //  Returns:    A 1D 11x Gaussian blurred texture lookup using 6 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Start from the unnormalized Gaussian weights for distances 0-5:
+    const float half_inv_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * half_inv_var);
+    const float g2 = exp(-4.0 * half_inv_var);
+    const float g3 = exp(-9.0 * half_inv_var);
+    const float g4 = exp(-16.0 * half_inv_var);
+    const float g5 = exp(-25.0 * half_inv_var);
+    const float norm_factor = 1.0 /
+        (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5));
+    //  Fold texel pairs into single linear taps, each offset by the outer
+    //  texel's share of the pair weight.  Both center taps take half of g0.
+    const float pair01 = g0 * 0.5 + g1;
+    const float shift01 = g1/pair01;
+    const float pair23 = g2 + g3;
+    const float shift23 = g3/pair23;
+    const float pair45 = g4 + g5;
+    const float shift45 = g5/pair45;
+    //  Accumulate the six taps in order along dxdy, then normalize:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += pair45 * tex2D_linearize(tex, tex_uv - (4.0 + shift45) * dxdy).rgb;
+    accum += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + shift23) * dxdy).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv - shift01 * dxdy).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv + shift01 * dxdy).rgb;
+    accum += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + shift23) * dxdy).rgb;
+    accum += pair45 * tex2D_linearize(tex, tex_uv + (4.0 + shift45) * dxdy).rgb;
+    return accum * norm_factor;
+}
+
+float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 4 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Start from the unnormalized Gaussian weights for distances 0-4:
+    const float half_inv_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * half_inv_var);
+    const float g2 = exp(-4.0 * half_inv_var);
+    const float g3 = exp(-9.0 * half_inv_var);
+    const float g4 = exp(-16.0 * half_inv_var);
+    const float norm_factor = 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4));
+    //  Fold the off-center texel pairs into single linear taps:
+    const float pair12 = g1 + g2;
+    const float shift12 = g2/pair12;
+    const float pair34 = g3 + g4;
+    const float shift34 = g4/pair34;
+    //  Accumulate the five taps in order along dxdy, then normalize:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += pair34 * tex2D_linearize(tex, tex_uv - (3.0 + shift34) * dxdy).rgb;
+    accum += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + shift12) * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + shift12) * dxdy).rgb;
+    accum += pair34 * tex2D_linearize(tex, tex_uv + (3.0 + shift34) * dxdy).rgb;
+    return accum * norm_factor;
+}
+
+float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup using 4 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  Start from the unnormalized Gaussian weights for distances 0-3:
+    const float half_inv_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * half_inv_var);
+    const float g2 = exp(-4.0 * half_inv_var);
+    const float g3 = exp(-9.0 * half_inv_var);
+    const float norm_factor = 1.0 / (g0 + 2.0 * (g1 + g2 + g3));
+    //  Fold texel pairs into single linear taps, each offset by the outer
+    //  texel's share of the pair weight.  Both center taps take half of g0.
+    const float pair01 = g0 * 0.5 + g1;
+    const float shift01 = g1/pair01;
+    const float pair23 = g2 + g3;
+    const float shift23 = g3/pair23;
+    //  Accumulate the four taps in order along dxdy, then normalize:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += pair23 * tex2D_linearize(tex, tex_uv - (2.0 + shift23) * dxdy).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv - shift01 * dxdy).rgb;
+    accum += pair01 * tex2D_linearize(tex, tex_uv + shift01 * dxdy).rgb;
+    accum += pair23 * tex2D_linearize(tex, tex_uv + (2.0 + shift23) * dxdy).rgb;
+    return accum * norm_factor;
+}
+
+float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 2 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Start from the unnormalized Gaussian weights for distances 0-2:
+    const float half_inv_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * half_inv_var);
+    const float g2 = exp(-4.0 * half_inv_var);
+    const float norm_factor = 1.0 / (g0 + 2.0 * (g1 + g2));
+    //  Fold the off-center texel pair into a single linear tap per side:
+    const float pair12 = g1 + g2;
+    const float shift12 = g2/pair12;
+    //  Accumulate the three taps in order along dxdy, then normalize:
+    float3 accum = float3(0.0,0.0,0.0);
+    accum += pair12 * tex2D_linearize(tex, tex_uv - (1.0 + shift12) * dxdy).rgb;
+    accum += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    accum += pair12 * tex2D_linearize(tex, tex_uv + (1.0 + shift12) * dxdy).rgb;
+    return accum * norm_factor;
+}
+
+float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  First get the unnormalized texel weights for distances 0 and 1.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    //  Calculate the combined weight and linear sample ratio of a texel pair.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w01 = w0 * 0.5 + w1;
+    const float w01_ratio = w1/w01;
+    //  Both taps carry weight w01 and the weight sum is w0 + 2.0 * w1 ==
+    //  2.0 * w01, so each normalized tap weight is exactly 0.5: just average.
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+
+////////////////////////////  HUGE SEPARABLE BLURS  ////////////////////////////
+
+//  Huge separable blurs come only in "fast" (paired linear tap) versions.
+float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  First get the unnormalized texel weights for distances 0-21:
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    const float w16 = exp(-256.0 * denom_inv);
+    const float w17 = exp(-289.0 * denom_inv);
+    const float w18 = exp(-324.0 * denom_inv);
+    const float w19 = exp(-361.0 * denom_inv);
+    const float w20 = exp(-400.0 * denom_inv);
+    const float w21 = exp(-441.0 * denom_inv);
+    //  The exact normalization factor would be 1.0 / (w0 + 2.0 * (w1 + w2 +
+    //  ... + w21)); a curve-fit approximation of it, which depends only on
+    //  sigma, is used instead:
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Calculate combined weights and linear sample ratios between texel pairs.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w16_17 = w16 + w17;
+    const float w18_19 = w18 + w19;
+    const float w20_21 = w20 + w21;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    const float w16_17_ratio = w17/w16_17;
+    const float w18_19_ratio = w19/w18_19;
+    const float w20_21_ratio = w21/w20_21;
+    //  Sum the weighted linear taps in order along dxdy, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
+    sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
+    sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 31x Gaussian blurred texture lookup using 16 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  First get the unnormalized texel weights for distances 0-15:
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    //  The exact normalization factor would be 1.0 / (w0 + 2.0 * (w1 + w2 +
+    //  ... + w15)); a curve-fit approximation of it, which depends only on
+    //  sigma, is used instead:
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Calculate combined weights and linear sample ratios between texel pairs.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
+    //  Sum the weighted linear taps in order along dxdy, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11fast()
+    //  Returns:    A 1D 25x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 12 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  First get the unnormalized texel weights for distances 0-12:
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    //  The exact factor would be 1.0 / (w0 + 2.0 * (w1 + w2 + ... + w12)); a
+    //  curve-fit approximation that depends only on sigma is used instead:
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Calculate combined weights and linear sample ratios between texel pairs.
+    const float w1_2 = w1 + w2;
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w9_10 = w9 + w10;
+    const float w11_12 = w11 + w12;
+    const float w1_2_ratio = w2/w1_2;
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    const float w9_10_ratio = w10/w9_10;
+    const float w11_12_ratio = w12/w11_12;
+    //  Sum the weighted taps in order along dxdy, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 17x Gaussian blurred texture lookup along dxdy, built
+    //              from 1 nearest neighbor tap and 8 linear taps (4 per side).
+    //              It may be mipmapped depending on settings and dxdy.
+    //  Unnormalized weight of the texel k steps from center: exp(-k*k/(2*sigma^2)).
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float g0 = 1.0;
+    const float g1 = exp(-1.0 * inv_two_var);
+    const float g2 = exp(-4.0 * inv_two_var);
+    const float g3 = exp(-9.0 * inv_two_var);
+    const float g4 = exp(-16.0 * inv_two_var);
+    const float g5 = exp(-25.0 * inv_two_var);
+    const float g6 = exp(-36.0 * inv_two_var);
+    const float g7 = exp(-49.0 * inv_two_var);
+    const float g8 = exp(-64.0 * inv_two_var);
+    //  Exact normalization would be 1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5 +
+    //  g6 + g7 + g8)); the shared helper is used instead:
+    const float norm = get_fast_gaussian_weight_sum_inv(sigma);
+    //  Texels k and k+1 share one linear tap: it carries their summed weight and sits at k + g(k+1)/sum.
+    const float pair1_2 = g1 + g2;
+    const float pair3_4 = g3 + g4;
+    const float pair5_6 = g5 + g6;
+    const float pair7_8 = g7 + g8;
+    const float pos1_2 = 1.0 + g2/pair1_2;
+    const float pos3_4 = 3.0 + g4/pair3_4;
+    const float pos5_6 = 5.0 + g6/pair5_6;
+    const float pos7_8 = 7.0 + g8/pair7_8;
+    //  Accumulate the taps from the -dxdy end to the +dxdy end, then normalize:
+    float3 acc = float3(0.0,0.0,0.0);
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv - pos7_8 * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv - pos5_6 * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv - pos3_4 * dxdy).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv - pos1_2 * dxdy).rgb;
+    acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
+    acc += pair1_2 * tex2D_linearize(tex, tex_uv + pos1_2 * dxdy).rgb;
+    acc += pair3_4 * tex2D_linearize(tex, tex_uv + pos3_4 * dxdy).rgb;
+    acc += pair5_6 * tex2D_linearize(tex, tex_uv + pos5_6 * dxdy).rgb;
+    acc += pair7_8 * tex2D_linearize(tex, tex_uv + pos7_8 * dxdy).rgb;
+    return acc * norm;
+}
+
+
+////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
+
+float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Global requirements must be met (see file description).
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
+    //              resized input.
+    //  Description:
+    //  The only arbitrarily resizable one-pass blur: a tex2Dblur5x5resize would
+    //  cost about as much as tex2Dblur9x9, far more than tex2Dblur5resize.
+    //  Gaussian weights depend only on each tap's squared distance from center:
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float weight_center = 1.0;
+    const float weight_edge = exp(-LENGTH_SQ(float2(1.0, 0.0)) * inv_two_var);
+    const float weight_corner = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_two_var);
+    const float norm = 1.0/(weight_center + 4.0 * (weight_edge + weight_corner));
+    //  All 3x3 taps are loaded individually.  Sharing taps across a pixel quad
+    //  wouldn't pay off: this should cost about as much as tex2Dblur5x5, while a
+    //  shared 4x4 sample field would behave more like tex2Dblur8x8shared (worse).
+    const float2 step_x = float2(dxdy.x, 0.0);
+    const float2 step_y = float2(0.0, dxdy.y);
+    const float2 row_b_uv = tex_uv;
+    const float2 row_a_uv = row_b_uv - step_y;
+    const float2 row_c_uv = row_b_uv + step_y;
+    const float3 tap_a_neg = tex2D_linearize(tex, row_a_uv - step_x).rgb;
+    const float3 tap_a_mid = tex2D_linearize(tex, row_a_uv).rgb;
+    const float3 tap_a_pos = tex2D_linearize(tex, row_a_uv + step_x).rgb;
+    const float3 tap_b_neg = tex2D_linearize(tex, row_b_uv - step_x).rgb;
+    const float3 tap_b_mid = tex2D_linearize(tex, row_b_uv).rgb;
+    const float3 tap_b_pos = tex2D_linearize(tex, row_b_uv + step_x).rgb;
+    const float3 tap_c_neg = tex2D_linearize(tex, row_c_uv - step_x).rgb;
+    const float3 tap_c_mid = tex2D_linearize(tex, row_c_uv).rgb;
+    const float3 tap_c_pos = tex2D_linearize(tex, row_c_uv + step_x).rgb;
+    //  Weight the center, the 4 edge taps, and the 4 corner taps, then normalize:
+    const float3 acc = weight_center * tap_b_mid +
+        weight_edge * (tap_a_mid + tap_b_neg + tap_b_pos + tap_c_mid) +
+        weight_corner * (tap_a_neg + tap_a_pos + tap_c_neg + tap_c_pos);
+    return acc * norm;
+}
+
+
+////////////////////////////  FASTER ONE-PASS BLURS  ///////////////////////////
+
+float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 9x9 Gaussian blur using a 5x5 grid of filtered samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed of
+    //              5x5 carefully selected bilinear samples.
+    //  Description:
+    //  Each bilinear sample location is adjusted within its texel block to
+    //  reflect the true Gaussian weights of the underlying texels (sigma is the
+    //  standard deviation in texels).  The following diagram illustrates the
+    //  relative locations of bilinear samples.  Each sample with the same number
+    //  has the same weight (notice the symmetry).  The letters a, b, c, d mark
+    //  quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
+    //  distinguish 1D directions along the line containing the pixel center:
+    //      6a 5a 2U 5b 6b
+    //      4a 3a 1U 3b 4b
+    //      2L 1L 0C 1R 2R
+    //      4c 3c 1D 3d 4d
+    //      6c 5c 2D 5d 6d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2, 2x1, 1x2, or 1x1 texel block:
+    //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
+    //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
+    //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
+    //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
+    //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
+    //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
+    //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
+    //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
+    //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
+    //  Note there is only one C texel and only two texels for each U, D, L, or
+    //  R sample.  The center sample is effectively a nearest neighbor sample,
+    //  and the U/D/L/R samples use 1D linear filtering.  All other texels are
+    //  read with bilinear samples somewhere within their 2x2 texel blocks.
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
+    //  the center, and reuse them independently for both dimensions.  Compute
+    //  these offsets based on the relative 1D Gaussian weights of the texels
+    //  in question.  (w1off means "Gaussian weight for the texel 1.0 texels
+    //  away from the pixel center," etc.).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float w4off = exp(-16.0 * denom_inv);
+    const float texel1to2ratio = w2off/(w1off + w2off);
+    const float texel3to4ratio = w4off/(w3off + w4off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0);
+    const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+    const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio);
+    const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio);
+    const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and": w4d2_5d3 is the weight of texels 4d2 and 5d3.
+    const float w1R1 = w1off;
+    const float w1R2 = w2off;
+    const float w2R1 = w3off;
+    const float w2R2 = w4off;
+    const float w3d1 =     exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w3d4 =     exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv);
+    const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv);
+    const float w6d1 =     exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv);
+    const float w6d4 =     exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights (sample 5 mirrors sample 4):
+    const float w0 = 1.0;
+    const float w1 = w1R1 + w1R2;
+    const float w2 = w2R1 + w2R2;
+    const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4;
+    const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
+    const float w5 = w4;
+    const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    //  Sampling order doesn't seem to affect performance, so just be clear (.yx turns an x-axis offset into a y-axis one):
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum samples with unnormalized weights, then scale by weight_sum_inv so the total weight is 1.0.
+    float3 sum = w0 * sample0C;
+    sum += w1 * (sample1R + sample1D + sample1L + sample1U);
+    sum += w2 * (sample2R + sample2D + sample2L + sample2U);
+    sum += w3 * (sample3d + sample3c + sample3b + sample3a);
+    sum += w4 * (sample4d + sample4c + sample4b + sample4a);
+    sum += w5 * (sample5d + sample5c + sample5b + sample5a);
+    sum += w6 * (sample6d + sample6c + sample6b + sample6a);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed of
+    //              4x4 carefully selected bilinear samples.
+    //  Description:
+    //  First see the descriptions for tex2Dblur9x9() and tex2Dblur7().  This
+    //  blur mixes concepts from both.  The sample layout is as follows:
+    //      4a 3a 3b 4b
+    //      2a 1a 1b 2b
+    //      2c 1c 1d 2d
+    //      4c 3c 3d 4d
+    //  The texel layout is as follows.  Note that samples 3a/3b, 1a/1b, 1c/1d,
+    //  and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
+    //  1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
+    //  the center texel):
+    //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
+    //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
+    //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
+    //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
+    //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
+    //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
+    //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4
+
+    //  COMPUTE TEXTURE COORDS:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur9x9); w0off is halved since two samples share texel 0:
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off = 1.0;
+    const float w1off = exp(-1.0 * denom_inv);
+    const float w2off = exp(-4.0 * denom_inv);
+    const float w3off = exp(-9.0 * denom_inv);
+    const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
+    const float texel2to3ratio = w3off/(w2off + w3off);
+    //  Statically compute texel offsets from the fragment center to each
+    //  bilinear sample in the bottom-right quadrant (other quadrants mirror these):
+    const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+    const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
+    //  Statically compute Gaussian texel weights for the bottom-right quadrant.
+    //  Read underscores as "and": w2d3_3d2 is the weight of texels 2d3 and 3d2.
+    const float w1abcd = 1.0;
+    const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv);
+    const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv);
+    const float w1d4 =       exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d3_3d2 =   exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4_3d4 =   exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d1 =       exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d2_4d3 =   exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4 =       exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    //  Statically add texel weights in each sample to get sample weights.
+    //  Split weights for shared texels between samples sharing them (1/4 for the center, 1/2 on the axes):
+    const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;
+    const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4;
+    const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4;
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv =
+        1.0/(4.0 * (w1 + 2.0 * w2_3 + w4));
+
+    //  LOAD TEXTURE SAMPLES:
+    //  Load all 16 samples using symmetry:
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+
+    //  SUM WEIGHTED SAMPLES:
+    //  Sum samples with unnormalized weights, then scale by weight_sum_inv so the total weight is 1.0.
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w1 * (sample1a + sample1b + sample1c + sample1d);
+    sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
+    sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
+    sum += w4 * (sample4a + sample4b + sample4c + sample4d);
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 5x5 Gaussian blur using a 3x3 grid of filtered taps.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed of
+    //              3x3 carefully selected bilinear samples.
+    //  Description:
+    //  Same concept and sample/texel locations as tex2Dblur9x9(), on a smaller
+    //  scale: each off-center tap is positioned inside the block of texels it
+    //  covers according to the relative Gaussian weights of those texels.
+    //  Tap layout:
+    //      2a 1U 2b
+    //      1L 0C 1R
+    //      2c 1D 2d
+    //  Texels covered by each tap (suffix = position within the tap's block):
+    //      2a4 2a3 1U2 2b3 2b4
+    //      2a2 2a1 1U1 2b1 2b2
+    //      1L2 1L1 0C1 1R1 1R2
+    //      2c2 2c1 1D1 2d1 2d2
+    //      2c4 2c3 1D2 2d3 2d4
+
+    //  TAP POSITIONS:
+    //  1D Gaussian weights of the texels 1 and 2 steps from center, and the
+    //  fractional position between them given by their relative weights.
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float weight_1 = exp(-1.0 * inv_two_var);
+    const float weight_2 = exp(-4.0 * inv_two_var);
+    const float frac_1to2 = weight_2/(weight_1 + weight_2);
+    //  Offsets (in texels) of the 1R axis tap and the 2d quadrant tap; every
+    //  other tap is obtained from these two by mirroring or swizzling.
+    const float2 axis_tap_offset = float2(1.0 + frac_1to2, 0.0);
+    const float2 quad_tap_offset = float2(1.0 + frac_1to2, 1.0 + frac_1to2);
+
+    //  TAP WEIGHTS:
+    //  2D texel weights inside the quadrant tap's 2x2 block, named by (x, y)
+    //  distance from center; the two off-diagonal texels share texel_2_1.
+    const float texel_1_1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_two_var);
+    const float texel_2_1 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * inv_two_var);
+    const float texel_2_2 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * inv_two_var);
+    //  A tap's weight is the sum of the weights of the texels it covers:
+    const float center_weight = 1.0;
+    const float axis_weight = weight_1 + weight_2;
+    const float quad_weight = texel_1_1 + 2.0 * texel_2_1 + texel_2_2;
+    //  Normalization factor for 1 center tap, 4 axis taps and 4 quadrant taps:
+    const float norm = 1.0/(center_weight + 4.0 * (axis_weight + quad_weight));
+
+    //  LOAD TAPS:
+    //  9 taps in total (1 nearest, 4 linear, 4 bilinear).  Sign-flipped copies
+    //  of dxdy mirror the 2d tap into the other three quadrants.
+    const float2 dxdy_flip_x = dxdy * float2(-1.0, 1.0);
+    const float2 dxdy_flip_y = dxdy * float2(1.0, -1.0);
+    const float2 dxdy_flip_xy = dxdy * float2(-1.0, -1.0);
+    //  The .yx swizzle turns the x-axis offset into the matching y-axis offset.
+    const float3 tap0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 tap1R = tex2D_linearize(tex, tex_uv + dxdy * axis_tap_offset).rgb;
+    const float3 tap1D = tex2D_linearize(tex, tex_uv + dxdy * axis_tap_offset.yx).rgb;
+    const float3 tap1L = tex2D_linearize(tex, tex_uv - dxdy * axis_tap_offset).rgb;
+    const float3 tap1U = tex2D_linearize(tex, tex_uv - dxdy * axis_tap_offset.yx).rgb;
+    const float3 tap2d = tex2D_linearize(tex, tex_uv + dxdy * quad_tap_offset).rgb;
+    const float3 tap2c = tex2D_linearize(tex, tex_uv + dxdy_flip_x * quad_tap_offset).rgb;
+    const float3 tap2b = tex2D_linearize(tex, tex_uv + dxdy_flip_y * quad_tap_offset).rgb;
+    const float3 tap2a = tex2D_linearize(tex, tex_uv + dxdy_flip_xy * quad_tap_offset).rgb;
+
+    //  SUM WEIGHTED TAPS:
+    //  Accumulate with unnormalized weights, then normalize once at the end.
+    float3 acc = center_weight * tap0C;
+    acc += axis_weight * (tap1R + tap1D + tap1L + tap1U);
+    acc += quad_weight * (tap2a + tap2b + tap2c + tap2d);
+    return acc * norm;
+}
+
+float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
+    //  Requires:   Same as tex2Dblur9()
+    //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed of
+    //              2x2 carefully selected bilinear samples.
+    //  Description:
+    //  Combines the ideas described for tex2Dblur9x9() and tex2Dblur7(): four
+    //  taps, one per quadrant, each positioned between the center texel and
+    //  its neighbors.  Tap layout:
+    //      0a 0b
+    //      0c 0d
+    //  Texel layout.  Taps 0a/0b and 0c/0d share a vertical column of texels,
+    //  taps 0a/0c and 0b/0d share a horizontal row of texels, and all four taps
+    //  share the center texel:
+    //      0a3  0ab2 0b3
+    //      0ac1 0*0  0bd1
+    //      0c3  0cd2 0d3
+
+    //  COMPUTE TEXTURE COORDS:
+    //  1D weights of the center texel and its neighbor.  Only half the center
+    //  weight enters the ratio, since the center is shared by the taps on both sides.
+    const float inv_two_var = 0.5/(sigma*sigma);
+    const float center_weight = 1.0;
+    const float neighbor_weight = exp(-1.0 * inv_two_var);
+    const float frac_0to1 = neighbor_weight/(center_weight * 0.5 + neighbor_weight);
+    //  Offset (in texels) of tap 0d; the same fraction applies on both axes,
+    //  and the other three taps are mirror images of this one.
+    const float2 tap_offset = float2(frac_0to1, frac_0to1);
+
+    //  LOAD TAPS:
+    //  Sign-flipped copies of dxdy mirror the 0d offset into the other three
+    //  quadrants, giving all 4 taps:
+    const float2 dxdy_flip_x = dxdy * float2(-1.0, 1.0);
+    const float2 dxdy_flip_y = dxdy * float2(1.0, -1.0);
+    const float2 dxdy_flip_xy = dxdy * float2(-1.0, -1.0);
+    const float3 tap0a = tex2D_linearize(tex, tex_uv + dxdy_flip_xy * tap_offset).rgb;
+    const float3 tap0b = tex2D_linearize(tex, tex_uv + dxdy_flip_y * tap_offset).rgb;
+    const float3 tap0c = tex2D_linearize(tex, tex_uv + dxdy_flip_x * tap_offset).rgb;
+    const float3 tap0d = tex2D_linearize(tex, tex_uv + dxdy * tap_offset).rgb;
+
+    //  SUM WEIGHTED TAPS:
+    //  By symmetry every tap has the same weight, so the result is their mean:
+    return 0.25 * (tap0a + tap0b + tap0c + tap0d);
+}
+
+
+//////////////////  LINEAR ONE-PASS BLURS WITH SHARED SAMPLES  /////////////////
+
+float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   1.) Same as tex2Dblur9()
+    //              2.) ddx() and ddy() are present in the current Cg profile.
+    //              3.) The GPU driver is using fine/high-quality derivatives.
+    //              4.) quad_vector *correctly* describes the current fragment's
+    //                  location in its pixel quad, by the conventions noted in
+    //                  get_quad_vector[_naive].
+    //              5.) tex_uv.w = log2(video_size/output_size).y
+    //              6.) tex2Dlod() is present in the current Cg profile.
+    //  Optional:   Tune artifacts vs. excessive blurriness with the global
+    //              float error_blurring.
+    //  Returns:    A blurred texture lookup using a "virtual" 12x12 Gaussian
+    //              blur (a 6x6 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  Perform a 1-pass blur with shared texture lookups across a pixel quad.
+    //  We'll get neighboring samples with high-quality ddx/ddy derivatives, as
+    //  in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
+    //  Message Passing" by Eric Penner.
+    //
+    //  Our "virtual" 12x12 blur will be comprised of ((6 - 1)^2)/4 + 3 = 12
+    //  bilinear samples, where bilinear sampling positions are computed from
+    //  the relative Gaussian weights of the 4 surrounding texels.  The catch is
+    //  that the appropriate texel weights and sample coords differ for each
+    //  fragment, but we're reusing most of the same samples across a quad of
+    //  destination fragments.  (We do use unique coords for the four nearest
+    //  samples at each fragment.)  Mixing bilinear filtering and sample-sharing
+    //  therefore introduces some error into the weights, and this can get nasty
+    //  when the source image is small or high-frequency.  Computing bilinear
+    //  ratios based on weights at the sample field center results in sharpening
+    //  and ringing artifacts, but we can move samples closer to halfway between
+    //  texels to try blurring away the error (which can move features around by
+    //  a texel or so).  Tune this with the global float "error_blurring".
+    //
+    //  The pixel quad's sample field covers 12x12 texels, accessed through 6x6
+    //  bilinear (2x2 texel) taps.  Each fragment depends on a window of 10x10
+    //  texels (5x5 bilinear taps), and each fragment is responsible for loading
+    //  a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps
+    //  to use unique bilinear coords for sample0* for each fragment.  This
+    //  diagram illustrates the relative locations of bilinear samples 1-9 for
+    //  each quadrant a, b, c, d (note samples will not be equally spaced):
+    //      8a 7a 6a 6b 7b 8b
+    //      5a 4a 3a 3b 4b 5b
+    //      2a 1a 0a 0b 1b 2b
+    //      2c 1c 0c 0d 1d 2d
+    //      5c 4c 3c 3d 4d 5d
+    //      8c 7c 6c 6d 7d 8d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2 texel block:
+    //      8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
+    //      8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
+    //      5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
+    //      5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
+    //      2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
+    //      2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
+    //      2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
+    //      2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
+    //      5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
+    //      5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
+    //      8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
+    //      8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
+    //  With this symmetric arrangement, we don't have to know which absolute
+    //  quadrant a sample lies in to assign kernel weights; it's enough to know
+    //  the sample number and the relative quadrant of the sample (relative to
+    //  the current quadrant):
+    //      {current, adjacent x, adjacent y, diagonal}
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
+    //  and [4, 5] away from the fragment, and reuse them independently for both
+    //  dimensions.  Use the sample field center as the estimated destination,
+    //  but nudge the result closer to halfway between texels to blur error.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  based on the sum of their 4 underlying texel weights.  Assume a same-
+    //  resolution blur, so each symmetrically named sample weight will compute
+    //  the same at every fragment in the pixel quad: We can therefore compute
+    //  texel weights based only on the bottom-right quadrant (fragment at 0d0).
+    //  To avoid too much boilerplate code, use a macro to get all 4 texel
+    //  weights for a bilinear sample based on the offset of its top-left texel:
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
+    const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
+    const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
+    const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
+    const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
+    const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
+    const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
+    const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
+    const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
+    const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    float3 sample8adjx, sample8adjy, sample8diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
+    sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
+    sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
+    sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 10x10 Gaussian
+    //              blur (a 5x5 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 25 of the 36 samples taken across the pixel quad (to cover a
+    //  5x5 sample area, or 10x10 texel area), so it is meant for a lower
+    //  standard deviation (sigma).  Thanks to symmetry, the 11 omitted samples
+    //  are always the same (relative to the current fragment's quadrant):
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 25 of the 36 sample weights.  Skip the following weights:
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the inverse of the sum of all 25 used weights (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w4curr + w5curr + w6curr + w7curr + w8curr +
+        w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
+        w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
+        w0diag + w1diag + w3diag + w4diag);
+    //  Statically pack most weights for runtime.  Note the mixed packing (w8curr stays scalar):
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);
+    const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch samples 1-7 from other fragments in the 2x2 quad (only 8curr is used, so 8 is not gathered):
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    //  Sum the weighted samples using the unnormalized weights (the sum is
+    //  scaled by weight_sum_inv at the end).  Fill each row of a matrix with an
+    //  rgb sample and pre-multiply by the weights.  First do the simple ones:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    //  Now do the mixed-sample ones:
+    sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
+    sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
+    sum += w8curr * sample8curr;
+    //  Normalize the sum (so the weights add to 1.0) and return:
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 8x8 Gaussian
+    //              blur (a 4x4 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This function
+    //  shares the same concept and a similar sample placement, except each
+    //  quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
+    //  respectively.  There could be a total of 16 samples, 4 of which each
+    //  fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
+    //  its own offset to reduce shared sample artifacts, bringing the sample
+    //  count for each fragment to 7.  Sample placement:
+    //      3a 2a 2b 3b
+    //      1a 0a 0b 1b
+    //      1c 0c 0d 1d
+    //      3c 2c 2d 3d
+    //  Texel placement:
+    //      3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
+    //      3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
+    //      1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
+    //      1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
+    //      1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
+    //      1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
+    //      3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
+    //      3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
+    
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime as (curr, adjx, adjy, diag):
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    //  Get the inverse of the sum of all 16 weights (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    //  Sum the weighted samples using the unnormalized weights (the sum is
+    //  scaled by weight_sum_inv at the end).  Fill each row of a matrix with an
+    //  rgb sample and pre-multiply by the weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 6x6 Gaussian
+    //              blur (a 3x3 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur8x8shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 9 of the 16 samples taken across the pixel quad (to cover a
+    //  3x3 sample area, or 6x6 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 7 omitted samples
+    //  are always the "same:"
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 9 of the 16 sample weights.  Skip the following weights:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
+    //  Statically pack some weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result for sample1*, and handle the rest
+    //  of the weights more directly/verbosely:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
+            w2adjx * sample2adjx + w3curr * sample3curr;
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  MAX OPTIMAL SIGMA BLUR WRAPPERS  //////////////////////
+
+//  The following blurs are static wrappers around the dynamic blurs above.
+//  HOPEFULLY, the compiler will be smart enough to do constant-folding.
+
+//  Resizable separable blurs (each forwards to the same-named 4-argument overload, passing blurN_std_dev as sigma; presumably the max optimal sigma for N taps, per the section title):
+inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Fast separable blurs (each forwards to the same-named 4-argument overload, passing blurN_std_dev as sigma):
+inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Huge, "fast" separable blurs (each forwards to the same-named 4-argument overload, passing blurN_std_dev as sigma):
+inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+//  Resizable one-pass blurs (forwards to the 4-argument overload, passing blur3_std_dev as sigma):
+inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" one-pass blurs (each NxN wrapper forwards to the same-named 4-argument overload, passing blurN_std_dev as sigma):
+inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" shared-sample one-pass blurs (each forwards to the same-named 5-argument overload, passing blurN_std_dev as sigma; tex_uv is a float4 and quad_vector is passed through unchanged):
+inline float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev);
+}
+inline float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev);
+}
+inline float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev);
+}
+inline float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev);
+}
+
+
+#endif  //  BLUR_FUNCTIONS_H
+
+////////////////////////////  END BLUR-FUNCTIONS  ///////////////////////////
+
+///////////////////////////////  BLOOM CONSTANTS  //////////////////////////////
+
+//  Compute constants with manual inlines of the functions below:
+static const float bloom_diff_thresh = 1.0/256.0;   //  Passed as "thresh" to get_min_sigma_to_blur_triad() by the callers below.
+
+
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Requires:   1.) triad_size is the final phosphor triad size in pixels
+    //              2.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    Return the minimum sigma that will fully blur a phosphor
+    //              triad on the screen to an even color, within thresh.
+    //              This closed-form function was found by curve-fitting data.
+    //  Fit in use (affine in triad_size for a fixed thresh): max error = ~0.086036, mean sq. error = ~0.0013387:
+    return -0.05168 + 0.6113*triad_size -
+        1.122*triad_size*sqrt(0.000416 + thresh);
+    //  Simpler alternative fit, kept for reference only (not compiled): max error = ~0.16486, mean sq. error = ~0.0041041:
+    //return 0.5985*triad_size - triad_size*sqrt(thresh);
+}
+
+inline float get_absolute_scale_blur_sigma(const float thresh)
+{
+    //  Requires:   1.) min_allowed_viewport_triads.x must be a global float.  The
+    //                  number of horizontal phosphor triads in the final image
+    //                  must be >= min_allowed_viewport_triads.x for realistic results.
+    //              2.) bloom_approx_scale_x must be a global float equal to the
+    //                  absolute horizontal scale of BLOOM_APPROX.
+    //              3.) bloom_approx_scale_x/min_allowed_viewport_triads.x
+    //                  should be <= 1.1658025090 to keep the final result <
+    //                  0.62666015625 (the largest sigma ensuring the largest
+    //                  unused texel weight stays < 1.0/256.0 for a 3x3 blur).
+    //              4.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    Return the minimum Gaussian sigma that will blur the pass
+    //              output as much as it would have taken to blur away
+    //              bloom_approx_scale_x horizontal phosphor triads.
+    //  Description:
+    //  BLOOM_APPROX should look like a downscaled phosphor blur.  Ideally, we'd
+    //  use the same blur sigma as the actual phosphor bloom and scale it down
+    //  to the current resolution with (bloom_approx_scale_x/viewport_size_x), but
+    //  we don't know the viewport size in this pass.  Instead, we'll blur as
+    //  much as it would take to blur away min_allowed_viewport_triads.x.  This
+    //  will blur "more than necessary" if the user actually uses more triads,
+    //  but that's not terrible either, because blurring a constant fraction of
+    //  the viewport may better resemble a true optical bloom anyway (since the
+    //  viewport will generally be about the same fraction of each player's
+    //  field of view, regardless of screen size and resolution).
+    //  Assume an extremely large viewport size for asymptotic results.  NOTE(review): max_viewport_size_x must be visible at this scope -- confirm a global definition exists.
+    return bloom_approx_scale_x/max_viewport_size_x *
+        get_min_sigma_to_blur_triad(
+            max_viewport_size_x/min_allowed_viewport_triads.x, thresh);
+}
+
+inline float get_center_weight(const float sigma)
+{
+    //  Given a Gaussian blur sigma, get the blur weight for the center texel.
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        return get_fast_gaussian_weight_sum_inv(sigma);   //  NOTE(review): unlike the static path below, this result is not squared here -- confirm that is intended.
+    #else
+        const float denom_inv = 0.5/(sigma*sigma);
+        const float w0 = 1.0;
+        const float w1 = exp(-1.0 * denom_inv);
+        const float w2 = exp(-4.0 * denom_inv);
+        const float w3 = exp(-9.0 * denom_inv);
+        const float w4 = exp(-16.0 * denom_inv);
+        const float w5 = exp(-25.0 * denom_inv);
+        const float w6 = exp(-36.0 * denom_inv);
+        const float w7 = exp(-49.0 * denom_inv);
+        const float w8 = exp(-64.0 * denom_inv);
+        const float w9 = exp(-81.0 * denom_inv);
+        const float w10 = exp(-100.0 * denom_inv);
+        const float w11 = exp(-121.0 * denom_inv);
+        const float w12 = exp(-144.0 * denom_inv);
+        const float w13 = exp(-169.0 * denom_inv);
+        const float w14 = exp(-196.0 * denom_inv);
+        const float w15 = exp(-225.0 * denom_inv);
+        const float w16 = exp(-256.0 * denom_inv);
+        const float w17 = exp(-289.0 * denom_inv);
+        const float w18 = exp(-324.0 * denom_inv);
+        const float w19 = exp(-361.0 * denom_inv);
+        const float w20 = exp(-400.0 * denom_inv);
+        const float w21 = exp(-441.0 * denom_inv);
+        //  Note: If the implementation uses a smaller blur than the max allowed,
+        //  the worst case scenario is that the center weight will be overestimated,
+        //  so we'll put a bit more energy into the brightpass...no huge deal.
+        //  Then again, if the implementation uses a larger blur than the max
+        //  "allowed" because of dynamic branching, the center weight could be
+        //  underestimated, which is more of a problem.  (The author's follow-up suggestion, "consider always using ...", was left unfinished.)
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            //  43x blur:
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 +
+                w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            //  31x blur:
+            const float weight_sum_inv = 1.0 /
+                (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 +
+                w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            //  25x blur:
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            //  17x blur:
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+                w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+        #else
+            //  9x blur:
+            const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+        const float center_weight = weight_sum_inv * weight_sum_inv;   //  Square of the normalized 1D center weight (w0 == 1.0); presumably because the blur runs as two separable 1D passes.
+        return center_weight;
+    #endif
+}
+
+inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Blur with one of the fixed-size "fast" blurs (9/17/25/31/43).  If sigma
+    //  is static, we can safely branch and use the smallest blur that's big
+    //  enough.  Ignore #define hints then: we'll only use a large blur if we actually need it, and the branches cost nothing.
+    #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+    #else
+        //  It's still worth branching if the profile supports dynamic branches:
+        //  It's much faster than using a hugely excessive blur, but each branch
+        //  eats ~1% FPS.
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        #endif
+    #endif
+    //  Failed optimization notes:
+    //  I originally created a same-size mipmapped 5-tap separable blur10 that
+    //  could handle any sigma by reaching into lower mip levels.  It was
+    //  as fast as blur25fast for runtime sigmas and a tad faster than
+    //  blur31fast for static sigmas, but mipmapping two viewport-size passes
+    //  ate 10% of FPS across all codepaths, so it wasn't worth it.
+    #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+        if(sigma <= blur9_std_dev)
+        {
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur17_std_dev)
+        {
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur25_std_dev)
+        {
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        }
+        else if(sigma <= blur31_std_dev)
+        {
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        }
+        else
+        {
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        }
+    #else
+        //  If we can't afford to branch, we can only guess at what blur
+        //  size we need.  Therefore, use the largest blur allowed (by the PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_*_PIXELS hints).
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
+        #else
+        #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
+        #else
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+        #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    #endif  //  PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
+}
+
+inline float get_bloom_approx_sigma(const float output_size_x_runtime,
+    const float estimated_viewport_size_x)
+{
+    //  Requires:   1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
+    //                  This is included for dynamic codepaths just in case the
+    //                  following two globals are incorrect:
+    //              2.) bloom_approx_size_x_for_fake should == the same
+    //                  if PHOSPHOR_BLOOM_FAKE is #defined
+    //              3.) bloom_approx_size_x should == the same otherwise
+    //              4.) estimated_viewport_size_x / mask_triad_size_desired gives
+    //                  the triad count when mask_specify_num_triads is 0 (see lerp).
+    //  Returns:    For gaussian4x4 (bloom_approx_filter > 1.5), a dynamic small
+    //              bloom sigma as close to optimal as available info allows.
+    //              Otherwise (blur3x3/bilinear), a static small bloom sigma.
+    //  Assume the default static value.  This is a compromise that ensures
+    //  typical triads are blurred, even if unusually large ones aren't.
+    static const float mask_num_triads_static =
+        max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
+    const float mask_num_triads_from_size =
+        estimated_viewport_size_x/mask_triad_size_desired;
+    const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x,
+        lerp(mask_num_triads_from_size, mask_num_triads_desired,
+            mask_specify_num_triads));
+    //  Assume an extremely large viewport size for asymptotic results:
+    static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
+    {
+        //  Use the runtime num triads and output size:
+        const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_runtime;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_runtime/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  account for the Gaussian scanline sigma from the last pass too.
+        //  The bloom will be too wide horizontally but tall enough vertically.
+        return length(float2(bloom_approx_sigma, beam_max_sigma));   //  sqrt(a*a + b*b) of the two sigmas
+    }
+    else    //  3x3 blur resize (the bilinear resize doesn't need a sigma)
+    {
+        //  We're either using blur3x3 or bilinear filtering.  The biggest
+        //  reason to choose blur3x3 is to avoid dynamic weights, so use a
+        //  static calculation.
+        #ifdef PHOSPHOR_BLOOM_FAKE
+            static const float output_size_x_static =
+                bloom_approx_size_x_for_fake;
+        #else
+            static const float output_size_x_static = bloom_approx_size_x;
+        #endif
+        static const float asymptotic_triad_size =
+            max_viewport_size_x/mask_num_triads_static;
+        const float asymptotic_sigma = get_min_sigma_to_blur_triad(
+            asymptotic_triad_size, bloom_diff_thresh);
+        const float bloom_approx_sigma =
+            asymptotic_sigma * output_size_x_static/max_viewport_size_x;
+        //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
+        //  try accounting for the Gaussian scanline sigma from the last pass
+        //  too; use the static default value:
+        return length(float2(bloom_approx_sigma, beam_max_sigma_static));
+    }
+}
+
+inline float get_final_bloom_sigma(const float bloom_sigma_runtime)
+{
+    //  Requires:   1.) bloom_sigma_runtime is a precalculated sigma that's
+    //                  optimal for the [known] triad size.
+    //              2.) Call this from a fragment shader (not a vertex shader),
+    //                  or blurring with static sigmas won't be constant-folded.
+    //  Returns:    If RUNTIME_PHOSPHOR_BLOOM_SIGMA is #defined, return the
+    //              optimal runtime sigma unchanged (10% slower).  Otherwise
+    //              return the optimistic static sigma computed below from
+    //              mask_triad_size_desired_static and bloom_diff_thresh.
+    //  Notes:      Call this from the fragment shader, NOT the vertex shader,
+    //              so static sigmas can be constant-folded!
+    const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad(
+        mask_triad_size_desired_static, bloom_diff_thresh);
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        return bloom_sigma_runtime;
+    #else
+        //  Overblurring looks as bad as underblurring, so assume average-size
+        //  triads, not worst-case huge triads:
+        return bloom_sigma_optimistic;
+    #endif
+}
+
+
+#endif  //  BLOOM_FUNCTIONS_H
+
+////////////////////////////  END BLOOM-FUNCTIONS  ///////////////////////////
+
+void main() {
+    //  This pass: Sample (misconverged?) scanlines to the final horizontal
+    //  resolution, apply halation (bouncing electrons), and apply the phosphor
+    //  mask.  Fake a bloom if requested.  Unless we fake a bloom, the output
+    //  will be dim from the scanline auto-dim, mask dimming, and low gamma.
+
+    //  Horizontally sample the current row (a vertically interpolated scanline)
+    //  and account for horizontal convergence offsets, given in units of texels.
+    const float3 scanline_color_dim = sample_rgb_scanline_horizontal(
+        VERTICAL_SCANLINEStexture, scanline_tex_uv,
+        VERTICAL_SCANLINEStexture_size, scanline_texture_size_inv);
+    const float auto_dim_factor = levels_autodim_temp;
+
+    //  Sample the phosphor mask:
+    const float2 tile_uv_wrap = video_uv * mask_tiles_per_screen;
+    const float2 mask_tex_uv = convert_phosphor_tile_uv_wrap_to_tex_uv(
+        tile_uv_wrap, mask_tile_start_uv_and_size);
+    float3 phosphor_mask_sample;
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        const bool sample_orig_luts = get_mask_sample_mode() > 0.5;
+    #else
+        static const bool sample_orig_luts = true;
+    #endif
+    if(sample_orig_luts)
+    {
+        //  If mask_type is static, this branch will be resolved statically.
+        if(mask_type < 0.5)
+        {
+            phosphor_mask_sample = tex2D_linearize(
+                mask_grille_texture_large, mask_tex_uv).rgb;
+        }
+        else if(mask_type < 1.5)
+        {
+            phosphor_mask_sample = tex2D_linearize(
+                mask_slot_texture_large, mask_tex_uv).rgb;
+        }
+        else
+        {
+            phosphor_mask_sample = tex2D_linearize(
+                mask_shadow_texture_large, mask_tex_uv).rgb;
+        }
+    }
+    else
+    {
+        //  Sample the resized mask, and avoid tiling artifacts:
+        phosphor_mask_sample = tex2Dtiled_mask_linearize(
+            MASK_RESIZEtexture, mask_tex_uv).rgb;
+    }
+
+    //  Sample the halation texture (auto-dim to match the scanlines), and
+    //  account for both horizontal and vertical convergence offsets, given
+    //  in units of texels horizontally and same-field scanlines vertically:
+    const float3 halation_color = tex2D_linearize(
+        HALATION_BLURtexture, halation_tex_uv).rgb;
+
+    //  Apply halation: Halation models electrons flying around under the glass
+    //  and hitting the wrong phosphors (of any color).  It desaturates, so
+    //  average the halation electrons to a scalar.  Reduce the local scanline
+    //  intensity accordingly to conserve energy.
+    const float3 halation_intensity_dim =
+        float3(dot(halation_color, float3(auto_dim_factor/3.0)));   //  == auto_dim_factor * mean(r, g, b), replicated to all channels
+    const float3 electron_intensity_dim = lerp(scanline_color_dim,
+        halation_intensity_dim, halation_weight);
+
+    //  Apply the phosphor mask:
+    const float3 phosphor_emission_dim = electron_intensity_dim *
+        phosphor_mask_sample;
+
+    #ifdef PHOSPHOR_BLOOM_FAKE
+        //  The BLOOM_APPROX pass approximates a blurred version of a masked
+        //  and scanlined image.  It's usually used to compute the brightpass,
+        //  but we can also use it to fake the bloom stage entirely.  Caveats:
+        //  1.) A fake bloom is conceptually different, since we're mixing in a
+        //      fully blurred low-res image, and the biggest implications are:
+        //  2.) If mask_amplify is incorrect, results deteriorate more quickly.
+        //  3.) The inaccurate blurring hurts quality in high-contrast areas.
+        //  4.) The bloom_underestimate_levels parameter seems less sensitive.
+        //  Reverse the auto-dimming and amplify to compensate for mask dimming:
+		#define PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
+        #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
+            static const float blur_contrast = 1.05;
+        #else
+            static const float blur_contrast = 1.0;
+        #endif
+        const float mask_amplify = get_mask_amplify();
+        const float undim_factor = 1.0/auto_dim_factor;
+        const float3 phosphor_emission =
+            phosphor_emission_dim * undim_factor * mask_amplify;
+        //  Get a phosphor blur estimate, accounting for convergence offsets:
+        const float3 electron_intensity = electron_intensity_dim * undim_factor;
+        const float3 phosphor_blur_approx_soft = tex2D_linearize(
+            BLOOM_APPROXtexture, blur3x3_tex_uv).rgb;
+        const float3 phosphor_blur_approx = lerp(phosphor_blur_approx_soft,
+            electron_intensity, 0.1) * blur_contrast;
+        //  We could blend between phosphor_emission and phosphor_blur_approx,
+        //  solving for the minimum blend_ratio that avoids clipping past 1.0:
+        //      1.0 >= total_intensity
+        //      1.0 >= phosphor_emission * (1.0 - blend_ratio) +
+        //              phosphor_blur_approx * blend_ratio
+        //      blend_ratio = (phosphor_emission - 1.0)/
+        //          (phosphor_emission - phosphor_blur_approx);
+        //  However, this blurs far more than necessary, because it aims for
+        //  full brightness, not minimal blurring.  To fix it, base blend_ratio
+        //  on a max area intensity only so it varies more smoothly:
+        const float3 phosphor_blur_underestimate =
+            phosphor_blur_approx * bloom_underestimate_levels;
+        const float3 area_max_underestimate =
+            phosphor_blur_underestimate * mask_amplify;
+        #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
+            const float3 blend_ratio_temp =
+                (area_max_underestimate - float3(1.0, 1.0, 1.0)) /
+                (area_max_underestimate - phosphor_blur_underestimate);
+        #else
+            //  (Dead code for now: the simple-blend macro is #defined above.)
+            //  Try doing it like an area-based brightpass.  This is nearly
+            //  identical, but it's worth toying with in case I ever find a way
+            //  to make it look more like a real bloom.  (I've had promising
+            //  textures from combining an area-based blend ratio for the
+            //  phosphor blur and a more brightpass-like blend ratio for the
+            //  phosphor emission, but brightness isn't correct across the whole
+            //  color range, especially across bloom_underestimate_levels values.)
+            const float desired_triad_size = lerp(mask_triad_size_desired,
+                output_size.x/mask_num_triads_desired,
+                mask_specify_num_triads);
+            const float bloom_sigma = get_min_sigma_to_blur_triad(
+                desired_triad_size, bloom_diff_thresh);
+            const float center_weight = get_center_weight(bloom_sigma);
+            const float3 max_area_contribution_approx =
+                max(float3(0.0, 0.0, 0.0), phosphor_blur_approx -
+                center_weight * phosphor_emission);
+            const float3 area_contrib_underestimate =
+                bloom_underestimate_levels * max_area_contribution_approx;
+            const float3 blend_ratio_temp =
+                ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) /
+                area_max_underestimate - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0);
+        #endif
+        //  Clamp blend_ratio in case it's out-of-range, but be SUPER careful:
+        //  min/max/clamp are BIZARRELY broken with lerp (optimization bug?),
+        //  and this redundant sequence avoids bugs, at least on nVidia cards:
+        const float3 blend_ratio_clamped = max(clamp(blend_ratio_temp, 0.0, 1.0), 0.0);
+        const float3 blend_ratio = lerp(blend_ratio_clamped, float3(1.0,1.0,1.0), bloom_excess);
+        //  Blend the blurred and unblurred images:
+        const float3 phosphor_emission_unclipped =
+            lerp(phosphor_emission, phosphor_blur_approx, blend_ratio);
+        //  Simulate refractive diffusion by reusing the halation sample.
+        const float3 pixel_color = lerp(phosphor_emission_unclipped,
+            halation_color, diffusion_weight);
+    #else
+        const float3 pixel_color = phosphor_emission_dim;
+    #endif
+    //  Encode if necessary, and output.
+    FragColor = encode_output(float4(pixel_color, 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.vs b/shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.vs
new file mode 100644
index 00000000..41e6f7c1
--- /dev/null
+++ b/shaders/CRT-Royale.shader/scanlines-horizontal-apply-mask.vs
@@ -0,0 +1,6047 @@
+#version 150
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+in vec4 position;  // per-vertex attribute
+in vec2 texCoord;  // per-vertex attribute
+
+out Vertex {  // interface block passed to the next stage; presumably filled in by main() (outside this excerpt)
+   vec2 vTexCoord;
+   vec2 video_uv;
+   vec2 scanline_tex_uv;
+   vec2 blur3x3_tex_uv;
+   vec2 halation_tex_uv;
+   vec2 scanline_texture_size_inv;
+   vec4 mask_tile_start_uv_and_size;
+   vec2 mask_tiles_per_screen;
+};
+
+uniform vec4 targetSize;  // .xy is aliased as output_size by the compatibility macros below
+uniform vec4 sourceSize[];  // unsized; indexed with constants 0, 1, 3 and 5 by the pass aliases below
+uniform int phase;  // aliased as frame_count by the compatibility macros below
+
+// USER SETTINGS BLOCK // (ranges quoted below are copied from the matching *_static entries further down this file)
+
+#define crt_gamma 2.500000  // see crt_gamma_static: range [1, 5]
+#define lcd_gamma 2.200000  // see lcd_gamma_static: range [1, 5]
+#define levels_contrast 1.0  // see levels_contrast_static: range [0, 4)
+#define halation_weight 0.0  // see halation_weight_static: range [0, 1]
+#define diffusion_weight 0.075  // see diffusion_weight_static: range [0, 1]
+#define bloom_underestimate_levels 0.8  // see bloom_underestimate_levels_static: range [0, 5]
+#define bloom_excess 0.000000  // see bloom_excess_static: range [0, 1]
+#define beam_min_sigma 0.020000  // see beam_min_sigma_static: range (0, 1]
+#define beam_max_sigma 0.300000  // see beam_max_sigma_static: range (0, 1]
+#define beam_spot_power 0.330000  // see beam_spot_power_static (1.0/3.0 there): range (0, 16]
+#define beam_min_shape 2.000000  // see beam_min_shape_static: range [2, 32]
+#define beam_max_shape 4.000000  // see beam_max_shape_static: range [2, 32]
+#define beam_shape_power 0.250000  // see beam_shape_power_static: range (0, 16]
+#define beam_horiz_filter 0.000000  // see beam_horiz_filter_static: 0 Quilez, 1 Gaussian, 2 Lanczos2
+#define beam_horiz_sigma 0.35  // see beam_horiz_sigma_static: range (0, 2/3]
+#define beam_horiz_linear_rgb_weight 1.000000  // see beam_horiz_linear_rgb_weight_static: range [0, 1]
+#define convergence_offset_x_r -0.000000  // see convergence_offsets_*_static: units of scanlines, ranges [-2, 2]
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000  // see mask_type_static: 0 aperture grille, 1 slot mask, 2 EDP shadow mask
+#define mask_sample_mode_desired 0.000000  // see mask_sample_mode_static: range [0, 2]
+#define mask_specify_num_triads 0.000000  // see mask_specify_num_triads_static: 0 = use triad size, 1 = use triad count
+#define mask_triad_size_desired 3.000000  // see mask_triad_size_desired_static (24.0 / 8.0 there)
+#define mask_num_triads_desired 480.000000  // see mask_num_triads_desired_static
+#define aa_subpixel_r_offset_x_runtime -0.0  // NOTE(review): aa_subpixel_r_offset_static uses -1.0/3.0 for x; confirm which is intended
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000  // see aa_cubic_c_static: range [0, 4]
+#define aa_gauss_sigma 0.500000  // see aa_gauss_sigma_static: range [0.0625, 1.0]
+#define geom_mode_runtime 0.000000  // see geom_mode_static: range [0, 3], 0 = off
+#define geom_radius 2.000000  // see geom_radius_static: range [1/(2*pi), 1024]
+#define geom_view_dist 2.000000  // see geom_view_dist_static: range [0.5, 1024]
+#define geom_tilt_angle_x 0.000000  // see geom_tilt_angle_static: radians, range [-pi, pi]
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000  // 432/329 is approximately 1.313, the value of geom_aspect_ratio_static
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000  // see geom_overscan_static: values < 1.0 zoom out, range (0, inf)
+#define geom_overscan_y 1.000000
+#define border_size 0.015  // see border_size_static: range [0, 0.5]
+#define border_darkness 2.0  // see border_darkness_static: range [0, inf)
+#define border_compress 2.500000  // see border_compress_static: range [1, inf)
+#define interlace_bff 0.000000  // see interlace_bff_static (false there)
+#define interlace_1080i 0.000000  // see interlace_1080i_static (false there)
+
+#define VERTICAL_SCANLINEStexture source[5]  // pass aliases: each named pass maps to one index of source[] / sourceSize[]
+#define VERTICAL_SCANLINEStexture_size sourceSize[5].xy
+#define VERTICAL_SCANLINESvideo_size sourceSize[5].xy  // for every pass here, video_size and texture_size expand to the same expression
+#define BLOOM_APPROXtexture source[3]
+#define BLOOM_APPROXtexture_size sourceSize[3].xy
+#define BLOOM_APPROXvideo_size sourceSize[3].xy
+#define HALATION_BLURtexture source[1]
+#define HALATION_BLURtexture_size sourceSize[1].xy
+#define HALATION_BLURvideo_size sourceSize[1].xy
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE  // NOTE(review): both branches define the same alias, so this test has no effect as written
+	#define MASK_RESIZEtexture source[0]
+#else  // NOTE(review): the only visible #define of the mode macro is further down and commented out
+	#define MASK_RESIZEtexture source[0]
+#endif
+#define MASK_RESIZEtexture_size sourceSize[0].xy
+#define MASK_RESIZEvideo_size sourceSize[0].xy
+
+// END USER SETTINGS BLOCK //
+
+// Compatibility macros that map HLSL/Cg spellings onto their GLSL equivalents.
+#define mul(a,b) ((b)*(a))  // operands are emitted in reverse order; parenthesized so compound arguments keep their grouping
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy  // expands to the same expression as video_size
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  // static, inline and const expand to nothing, so those qualifiers vanish from the code below
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  // NOTE(review): passes its 1st argument as atan()'s 2nd, yet HLSL atan2(y, x) and GLSL atan(y, x) share one order - confirm callers expect the swap
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]  // alias for the first entry of source[]
+
+#if defined(GL_ES)  // a precision qualifier is only emitted when the compiler defines GL_ES
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130  // this file declares "#version 150", so this is the branch taken
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+// VERTEX INCLUDES //
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 is what is set here (an older note claiming 1.0 did not match the value)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (1.0 was suggested if 0.5 looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16 floor; yields |c| (sign of c is dropped)
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance (TEX2DLOD >
+//  TEX2DBIAS > TILE_FLAT_TWICE > FIX_DISCONTINUITIES) and #undef every lower-priority one.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them into one macro:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way (tex2Dlod wins over tex2Dbias):
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.  Otherwise the resize source is the small LUT.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels.  NOTE(review): the offset's sign is kept, so a negative .x shrinks this border; confirm abs() isn't intended:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border (unless a tex2Dlod/tex2Dbias strategy is active):
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions (the smallest allowed triad size):
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.  The general case is one tile plus a border on each side.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is), i.e. triads rendered / viewport scale:
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite (hence the #ifdef).
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic branches/looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.  (Only checked when DRIVERS_ALLOW_DYNAMIC_BRANCHES is not #defined.)
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5 (the value used here; raise it toward 1.0 if the result is unnecessarily dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif  //  !RADEON_FIX
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5].  NOTE(review): the x value below is -1/3, outside this stated range -- confirm whether the range means magnitude.
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);//  disabled multiplier, presumably the padding fix from b.) above: * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default.  Raise it toward 1.0 if the image looks unnecessarily dark.
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero (the result is abs(c) clamped to >= 2^-16, so the sign is dropped); using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (anything > 1.5) with mode 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD: tex2Dbias is next in line.
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  !DRIVERS_ALLOW_DYNAMIC_BRANCHES: Only if the user forces it.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+    static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+    //  Most antialiasing filters have a base radius of 4.0 pixels.
+    //  NOTE(review): the offset is added signed; confirm abs() isn't needed.
+    static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#else   //  !GEOMETRY_EARLY
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifdef GEOMETRY_EARLY
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#else   //  !GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  One tile plus a border on each side, as with GEOMETRY_EARLY:
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#endif  //  GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Cap the requested ratio at geom_max_aspect_ratio, then normalize the
+    //  (ratio, 1.0) vector so its absolute scale can't leak into the
+    //  uv-mapping for curvature:
+    const float capped_ratio =
+        min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const float2 unscaled_aspect = float2(capped_ratio, 1.0);
+    return normalize(unscaled_aspect);
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Pack the scalar x/y overscan values into one float2.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Pack the scalar x/y tilt angles into one float2.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  The x convergence offsets, packed in R/G/B order.
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  The y convergence offsets, packed in R/G/B order.
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  The red convergence offset as an (x, y) pair.
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  The green convergence offset as an (x, y) pair.
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  The blue convergence offset as an (x, y) pair.
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+            //  Both runtime options are on.  WARNING: EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #else   //  Runtime offsets need runtime weights too:
+            return aa_subpixel_r_offset_static;
+        #endif
+    #else   //  Runtime subpixel offsets are disabled:
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Gain for each mask type is the reciprocal of its average color:
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    const float slot_or_shadow = mask_type < 1.5 ? slot_gain : shadow_gain;
+    return mask_type < 0.5 ? grille_gain : slot_or_shadow;
+}
+
+inline float get_mask_sample_mode()
+{
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE    //  Any mode may be requested:
+        #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+            return mask_sample_mode_desired;
+        #else
+            return mask_sample_mode_static;
+        #endif
+    #else   //  Without manual resizing, restrict the mode to [1.0, 2.0]:
+        #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+            return clamp(mask_sample_mode_desired, 1.0, 2.0);
+        #else
+            return clamp(mask_sample_mode_static, 1.0, 2.0);
+        #endif
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).  Default: 0.5 (try 1.0 if too dark).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an older note here claimed 1.0, but the value is 0.5)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);  //  1/16
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  !PHOSPHOR_MASK_GRILLE14: 15-pixel stripes (the default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; drops sign
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  !SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static setting as-is
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD: fall back to the next strategy
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  neither the tex2Dlod nor the tex2Dbias strategy is active
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  !DRIVERS_ALLOW_DYNAMIC_BRANCHES: only enable if explicitly forced
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  !GEOMETRY_EARLY: no antialiasing border is reserved
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else   //  tex2Dlod/tex2Dbias strategy active: no extra anisotropic border
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else   //  !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else   //  !GEOMETRY_EARLY: no curvature factor applied
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE: one tile + borders
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: always one tile plus a border on each side
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Return an Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral_0^x(e**(-t**2) dt)
+    //              This approximation has a max absolute error of 2.5*10**-5
+    //              with solid numerical robustness and efficiency.  See:
+	//                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+	static const float4 one = float4(1.0);
+	const float4 sign_x = sign(x);  //  erf is odd: evaluate on |x|, restore sign last
+	const float4 t = one/(one + 0.47047*abs(x));
+	const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));  //  cubic in t (Horner form) times e**(-x**2)
+	return result * sign_x;
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 version of erf6(): same formula as the float4 version above.
+	static const float3 one = float3(1.0);
+	const float3 sign_x = sign(x);
+	const float3 t = one/(one + 0.47047*abs(x));
+	const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 version of erf6(): same formula as the float4 version above.
+	static const float2 one = float2(1.0);
+	const float2 sign_x = sign(x);
+	const float2 t = one/(one + 0.47047*abs(x));
+	const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float erf6(const float x)
+{
+    //  Scalar float version of erf6(): same formula as the float4 version above.
+	const float sign_x = sign(x);
+	const float t = 1.0/(1.0 + 0.47047*abs(x));
+	const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+		exp(-(x*x));
+	return result * sign_x;
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    Approximate erf(x) as tanh(1.202760580 * x).  The error is
+    //              visually noticeable, but it's blazing fast and perceptually
+    //              close...at least on ATI hardware.  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Only use this if your hardware drivers correctly implement
+    //              tanh(): My nVidia 8800GTS returns garbage output.
+	return tanh(1.202760580 * x);
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 version of erft(): tanh-based erf approximation (see float4 version).
+	return tanh(1.202760580 * x);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 version of erft(): tanh-based erf approximation (see float4 version).
+	return tanh(1.202760580 * x);
+}
+
+float erft(const float x)
+{
+    //  Scalar float version of erft(): tanh-based erf approximation (see float4 version).
+	return tanh(1.202760580 * x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 version: erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 version: erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar version: erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifdef ERF_FAST_APPROXIMATION
+		return erft(x);
+	#else
+		return erf6(x);
+	#endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the standard parameter to the gamma function, and
+    //                  it should lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  This implementation function requires
+    //                  the caller to precompute this value, giving users the
+    //                  opportunity to reuse it.
+    //  Returns:    Return approximate gamma function (real-numbered factorial)
+    //              output using the Lanczos approximation with two coefficients
+    //              calculated using Paul Godfrey's method here:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  We could use three coeffs (0.0000346 error)
+    //              without hurting latency, but this allows more parallelism
+    //              with outside instructions.
+	static const float4 g = float4(1.12906830989);
+	static const float4 c0 = float4(0.8109119309638332633713423362694399653724431);
+	static const float4 c1 = float4(0.4808354605142681877121661197951496120000040);
+	static const float4 e = float4(2.71828182845904523536028747135266249775724709);
+	const float4 sph = s + float4(0.5);
+	const float4 lanczos_sum = c0 + c1/(s + float4(1.0));
+	const float4 base = (sph + g)/e;  //  or (s + g + float4(0.5))/e
+	//  gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
+	//  This has less error for small s's than (s -= 1.0) at the beginning.
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 version of gamma_impl(); s_inv must be 1.0/s (see float4 version).
+	static const float3 g = float3(1.12906830989);
+	static const float3 c0 = float3(0.8109119309638332633713423362694399653724431);
+	static const float3 c1 = float3(0.4808354605142681877121661197951496120000040);
+	static const float3 e = float3(2.71828182845904523536028747135266249775724709);
+	const float3 sph = s + float3(0.5);
+	const float3 lanczos_sum = c0 + c1/(s + float3(1.0));
+	const float3 base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 version of gamma_impl(); s_inv must be 1.0/s (see float4 version).
+	static const float2 g = float2(1.12906830989);
+	static const float2 c0 = float2(0.8109119309638332633713423362694399653724431);
+	static const float2 c1 = float2(0.4808354605142681877121661197951496120000040);
+	static const float2 e = float2(2.71828182845904523536028747135266249775724709);
+	const float2 sph = s + float2(0.5);
+	const float2 lanczos_sum = c0 + c1/(s + float2(1.0));
+	const float2 base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar version of gamma_impl(); s_inv must be 1.0/s (see float4 version).
+	static const float g = 1.12906830989;
+	static const float c0 = 0.8109119309638332633713423362694399653724431;
+	static const float c1 = 0.4808354605142681877121661197951496120000040;
+	static const float e = 2.71828182845904523536028747135266249775724709;
+	const float sph = s + 0.5;
+	const float lanczos_sum = c0 + c1/(s + 1.0);
+	const float base = (sph + g)/e;
+	return (pow(base, sph) * lanczos_sum) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the standard parameter to the gamma function, and it
+    //              should lie in the [0, 36] range.
+    //  Returns:    Return approximate gamma function output with a maximum
+    //              relative error of 0.000463.  See gamma_impl for details.
+	return gamma_impl(s, float4(1.0)/s);  //  NOTE: divides by s, so s == 0 is unsafe
+}
+
+float3 gamma(const float3 s)
+{
+    //  Float3 version of gamma(): computes 1.0/s and forwards to gamma_impl().
+	return gamma_impl(s, float3(1.0)/s);
+}
+
+float2 gamma(const float2 s)
+{
+    //  Float2 version of gamma(): computes 1.0/s and forwards to gamma_impl().
+	return gamma_impl(s, float2(1.0)/s);
+}
+
+float gamma(const float s)
+{
+    //  Scalar version of gamma(): computes 1.0/s and forwards to gamma_impl().
+	return gamma_impl(s, 1.0/s);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed for outside reuse)
+    //  Returns:    A series representation for the lower incomplete gamma
+    //              function for small s and small z (4 terms).
+    //  The actual "rolled up" summation looks like:
+	//      last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
+	//      sum = last_sign * last_pow / ((s + 0) * last_factorial)
+	//      for(int k = 1; k < 4; ++k)
+	//      {
+	//          last_sign *= -1.0; last_pow *= z; last_factorial *= k;
+	//          sum += last_sign * last_pow / ((s + k) * last_factorial);
+	//      }
+	//  Unrolled, constant-unfolded and arranged for madds and parallelism:
+	const float4 scale = pow(z, s);
+	float4 sum = s_inv;  //  Summation iteration 0 result
+	//  Summation iterations 1, 2, and 3:
+	const float4 z_sq = z*z;
+	const float4 denom1 = s + float4(1.0);
+	const float4 denom2 = 2.0*s + float4(4.0);
+	const float4 denom3 = 6.0*s + float4(18.0);
+	//float4 denom4 = 24.0*s + float4(96.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	//sum += z_sq * z_sq / denom4;
+	//  Scale and return:
+	return scale * sum;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+    //  Float3 version: same 4-term series as the float4 version; s_inv = 1.0/s.
+	const float3 scale = pow(z, s);
+	float3 sum = s_inv;
+	const float3 z_sq = z*z;
+	const float3 denom1 = s + float3(1.0);
+	const float3 denom2 = 2.0*s + float3(4.0);
+	const float3 denom3 = 6.0*s + float3(18.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+    //  Float2 version: same 4-term series as the float4 version; s_inv = 1.0/s.
+	const float2 scale = pow(z, s);
+	float2 sum = s_inv;
+	const float2 z_sq = z*z;
+	const float2 denom1 = s + float2(1.0);
+	const float2 denom2 = 2.0*s + float2(4.0);
+	const float2 denom3 = 6.0*s + float2(18.0);
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+    //  Scalar version: same 4-term series as the float4 version; s_inv = 1.0/s.
+	const float scale = pow(z, s);
+	float sum = s_inv;
+	const float z_sq = z*z;
+	const float denom1 = s + 1.0;
+	const float denom2 = 2.0*s + 4.0;
+	const float denom3 = 6.0*s + 18.0;
+	sum -= z/denom1;
+	sum += z_sq/denom2;
+	sum -= z * z_sq/denom3;
+	return scale * sum;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    Gauss's continued fraction representation for the upper
+    //              incomplete gamma function (4 terms).
+	//  The "rolled up" continued fraction looks like this.  The denominator
+    //  is truncated, and it's calculated "from the bottom up:"
+	//      denom = float4('inf');
+	//      float4 one = float4(1.0);
+	//      for(int i = 4; i > 0; --i)
+	//      {
+	//          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
+	//      }
+	//  Unrolled and constant-unfolded for madds and parallelism:
+	const float4 numerator = pow(z, s) * exp(-z);
+	float4 denom = float4(7.0) + z - s;  //  i = 4: the (...)/inf term vanishes
+	denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom;  //  i = 3
+	denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom;  //  i = 2
+	denom = float4(1.0) + z - s + (s - float4(1.0))/denom;  //  i = 1
+	return numerator / denom;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+    //  Float3 version: same 4-term continued fraction as the float4 version.
+	const float3 numerator = pow(z, s) * exp(-z);
+	float3 denom = float3(7.0) + z - s;
+	denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom;
+	denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom;
+	denom = float3(1.0) + z - s + (s - float3(1.0))/denom;
+	return numerator / denom;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+    //  float2 overload: four unrolled levels of the continued fraction.
+    float2 fraction = float2(7.0) + z - s;
+    fraction = float2(5.0) + z - s + (3.0*s - float2(9.0))/fraction;
+    fraction = float2(3.0) + z - s + (2.0*s - float2(4.0))/fraction;
+    fraction = float2(1.0) + z - s + (s - float2(1.0))/fraction;
+    const float2 scale = pow(z, s) * exp(-z);
+    return scale / fraction;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+    //  Scalar overload: four unrolled levels of the continued fraction.
+    float fraction = 7.0 + z - s;
+    fraction = 5.0 + z - s + (3.0*s - 9.0)/fraction;
+    fraction = 3.0 + z - s + (2.0*s - 4.0)/fraction;
+    fraction = 1.0 + z - s + (s - 1.0)/fraction;
+    const float scale = pow(z, s) * exp(-z);
+    return scale / fraction;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed for outside reuse)
+    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+    //  Returns:    The normalized lower incomplete gamma function, approximated
+    //              for s < 0.5.  Restricting s this way leaves only two
+    //              branches (not four) to pick between based on z.  Each one
+    //              uses four terms, with a max relative error of ~0.00182.  The
+    //              branch threshold and specifics were adapted for fewer terms
+    //              from Gil/Segura/Temme's paper here:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+    //  Both branches are always computed and then blended with 0/1 masks:
+    //  Real branches test slower even when available.
+    static const float4 cutoff = float4(0.775075);
+    const float4 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float4 from_fraction = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    //  Compare per component to build the branch-selection mask:
+    bool4 use_fraction;
+    use_fraction.x = z.x > cutoff.x;
+    use_fraction.y = z.y > cutoff.y;
+    use_fraction.z = z.z > cutoff.z;
+    use_fraction.w = z.w > cutoff.w;
+    return from_fraction * float4(use_fraction) + from_series * float4(not(use_fraction));
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+    //  float3 overload: evaluate both branches, then blend with 0/1 masks.
+    static const float3 cutoff = float3(0.775075);
+    const float3 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float3 from_fraction = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    bool3 use_fraction;
+    use_fraction.x = z.x > cutoff.x;
+    use_fraction.y = z.y > cutoff.y;
+    use_fraction.z = z.z > cutoff.z;
+    const float3 fraction_mask = float3(use_fraction);
+    return from_fraction * fraction_mask + from_series * float3(not(use_fraction));
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+    //  float2 overload: evaluate both branches, then blend with 0/1 masks.
+    static const float2 cutoff = float2(0.775075);
+    const float2 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float2 from_fraction = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    bool2 use_fraction;
+    use_fraction.x = z.x > cutoff.x;
+    use_fraction.y = z.y > cutoff.y;
+    const float2 fraction_mask = float2(use_fraction);
+    return from_fraction * fraction_mask + from_series * float2(not(use_fraction));
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+    //  Scalar overload: evaluate both branches, then blend with 0/1 masks.
+    static const float cutoff = 0.775075;
+    const float from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+    const float from_fraction = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+    const bool use_fraction = z > cutoff;
+    return from_fraction * float(use_fraction) + from_series * float(!use_fraction);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    The normalized lower incomplete gamma function, approximated
+    //              for s < 0.5 by normalized_ligamma_impl() (details there).
+    const float4 reciprocal_s = float4(1.0)/s;
+    const float4 reciprocal_gamma = float4(1.0)/gamma_impl(s, reciprocal_s);
+    return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+    //  float3 overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+    const float3 reciprocal_s = float3(1.0)/s;
+    const float3 reciprocal_gamma = float3(1.0)/gamma_impl(s, reciprocal_s);
+    return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+    //  float2 overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+    const float2 reciprocal_s = float2(1.0)/s;
+    const float2 reciprocal_gamma = float2(1.0)/gamma_impl(s, reciprocal_s);
+    return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+    //  Scalar overload: precompute 1/s and 1/gamma(s), then defer to the impl.
+    const float reciprocal_s = 1.0/s;
+    const float reciprocal_gamma = 1.0/gamma_impl(s, reciprocal_s);
+    return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas (ntsc_gamma doubles as the default pipeline gamma):
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default for get_crt_gamma()
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default for get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  true: encode/decode helpers return alpha = 1.0
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The including file must already define crt_gamma, gba_gamma, and lcd_gamma:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Defaults: reference CRT gamma, a literal GBA gamma, office LCD gamma
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The including file must already define intermediate_gamma, input_gamma, and output_gamma:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD  //  Input: CRT gamma.  Output: LCD gamma.
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD  //  Input: GBA gamma.  Output: LCD gamma.
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT  //  Input: LCD gamma.  Output: CRT gamma.
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT  //  Input: GBA gamma.  Output: CRT gamma.
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO  //  Default: only the first/last passes convert gamma
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Later passes: decode_input() returns its argument untouched
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Earlier passes: encode_output() returns its argument untouched
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Middle/last passes read what the previous pass encoded
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  First/middle passes write with the shared intermediate gamma
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;  //  True only when decode_input() is a pass-through
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode a color for this pass's output.
+    //  Mirrors decode_input(), but applies the reciprocal output gamma.
+    //  Returns:    color unchanged if gamma_encode_output is false.  Otherwise
+    //              rgb is raised to 1.0/get_pass_output_gamma(), and alpha is
+    //              forced to 1.0 if assume_opaque_alpha (else passed through).
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+
+    //  Alpha is never gamma-encoded; only its source differs:
+    const float3 exponent = float3(1.0/get_pass_output_gamma());
+    const float3 encoded_rgb = pow(color.rgb, exponent);
+    const float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize a sample read from this pass's input.
+    //  Mirrors encode_output(), but applies the input gamma directly.
+    //  Returns:    color unchanged if linearize_input is false.  Otherwise
+    //              rgb is raised to get_pass_input_gamma(), and alpha is
+    //              forced to 1.0 if assume_opaque_alpha (else passed through).
+    if(!linearize_input)
+    {
+        return color;
+    }
+
+    //  Alpha is never gamma-decoded; only its source differs:
+    const float3 exponent = float3(get_pass_input_gamma());
+    const float3 linear_rgb = pow(color.rgb, exponent);
+    const float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Linearize color with a caller-supplied per-channel gamma.
+    //  Returns:    rgb = pow(color.rgb, gamma).  Alpha is forced to 1.0 if
+    //              assume_opaque_alpha is set; otherwise it passes through.
+    //  Unlike decode_input(), this never consults linearize_input: the
+    //  decode is applied unconditionally.
+    const float3 linear_rgb = pow(color.rgb, gamma);
+    const float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D:
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }  //  Sample, then apply this pass's decode_input()
+//  float3 coords: only .xy is sampled; .z is ignored.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is forwarded as textureLod()'s third (LOD) argument, not as a texel offset -- confirm intent.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  NOTE(review): as above, texel_off lands in the LOD slot; tex_coords.z is ignored.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }  //  Always samples LOD 0.0; tex_coords.zw are ignored
+//  NOTE(review): here texel_off is passed as the LOD and tex_coords.zw are ignored -- confirm intent.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: sample with textureLod() and decode with the supplied gamma.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): the overload above always samples LOD 0.0 (tex_coords.zw are unused), and the one below passes texel_off as the LOD argument -- confirm both are intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  containing the desired minimum and maximum beam standard
+    //                  deviations, for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; we take
+    //                  sigma_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
+    //              inner f(color) subfunction (see below) as:
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The standard deviation of the Gaussian beam for "color:"
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube balances
+    //  between variable width and a soft/realistic shape.   A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    if(!(beam_spot_shape_function < 0.5))
+    {
+        //  Spherical curve: f(color) = sqrt(1 - (1 - color)^2)
+        const float3 one_minus_color = float3(1.0) - color;
+        const float3 sphere = sqrt(float3(1.0) - one_minus_color*one_minus_color);
+        return float3(beam_min_sigma) + sigma_range * sphere;
+    }
+    else
+    {
+        //  Power curve: f(color) = pow(color, beam_spot_power)
+        const float3 powered = pow(color, float3(beam_spot_power));
+        return float3(beam_min_sigma) + sigma_range * powered;
+    }
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  containing the desired min/max generalized Gaussian
+    //                  beta parameters, for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; we take
+    //                  shape_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian "shape" parameter beta for
+    //              color: beam_min_shape + shape_range * color^beam_shape_power
+    //  Details/Discussion: beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    const float3 shaped_color = pow(color, float3(beam_shape_power));
+    return beam_min_shape + shape_range * shaped_color;
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  Details:
+    //  The CRT beam profile follows a roughly Gaussian distribution which is
+    //  wider for bright colors than dark ones.  The integral over the full
+    //  range of a Gaussian function is always 1.0, so we can vary the beam
+    //  with a standard deviation without affecting brightness.  'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  Use a numerical approximation of the "error function" (the Gaussian
+    //  indefinite integral) to find the definite integral of the scanline's
+    //  average brightness over a given pixel area.  Even if curved coords were
+    //  used in this pass, a flat scalar pixel height works almost as well as a
+    //  pixel height computed from a full pixel-space to scanline-space matrix.
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 erf_scale = 1.0/(beam_sigma*sqrt(2.0));
+    const float3 erf_upper = erf((dist + half_pixel)*erf_scale);
+    const float3 erf_lower = erf((dist - half_pixel)*erf_scale);
+    return color * 0.5*(erf_upper - erf_lower)/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 inv_alpha = float3(1.0)/alpha;
+    const float3 inv_beta = float3(1.0)/beta;
+    //  s = 1/beta (inv_beta).  Pass beta alongside it to gamma_impl, and pass
+    //  beta and 1/gamma(s) to normalized_ligamma_impl, to avoid repeated divides.
+    const float3 inv_gamma_s = float3(1.0)/gamma_impl(inv_beta, beta);
+    const float3 upper_edge = dist + half_pixel;
+    const float3 lower_edge = dist - half_pixel;
+    const float3 upper_term = sign(upper_edge) * normalized_ligamma_impl(
+        inv_beta, pow(abs(upper_edge)*inv_alpha, beta), beta, inv_gamma_s);
+    const float3 lower_term = sign(lower_edge) * normalized_ligamma_impl(
+        inv_beta, pow(abs(lower_edge)*inv_alpha, beta), beta, inv_gamma_s);
+    return color * 0.5*(upper_term - lower_term)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  See scanline_gaussian_integral_contrib() for detailed comments!
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    //  Precompute the reciprocal factors so the samples need no divides:
+    const float3 inv_sigma = float3(1.0)/beam_sigma;
+    const float3 exp_scale = 0.5 * inv_sigma * inv_sigma;
+    const float3 norm_scale = inv_sigma/sqrt(2.0*pi);
+    if(!(beam_antialias_level > 0.5))
+    {
+        //  beam_antialias_level <= 0.5: take one Gaussian sample at dist.
+        return color*exp(-(dist*dist)*exp_scale)*norm_scale;
+    }
+    else
+    {
+        //  Average three Gaussian samples: dist and 1/3 pixel to each side.
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_far = dist + third_pixel;
+        const float3 dist_near = abs(dist - third_pixel);
+        const float3 third_scale = color/3.0 * norm_scale;
+        const float3 w_mid = exp(-(dist*dist)*exp_scale);
+        const float3 w_far = exp(-(dist_far*dist_far)*exp_scale);
+        const float3 w_near = exp(-(dist_near*dist_near)*exp_scale);
+        return third_scale * (w_mid + w_far + w_near);
+    }
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  See scanline_generalized_gaussian_integral_contrib() for details!
+    //  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    //  Reciprocals are computed once and reused below:
+    const float3 inv_alpha = float3(1.0)/alpha;
+    const float3 inv_beta = float3(1.0)/beta;
+    const float3 norm = color * beta * 0.5 * inv_alpha /
+        gamma_impl(inv_beta, beta);
+    if(!(beam_antialias_level > 0.5))
+    {
+        //  beam_antialias_level <= 0.5: one generalized Gaussian sample.
+        return norm * exp(-pow(abs(dist*inv_alpha), beta));
+    }
+    else
+    {
+        //  Average three samples: dist and 1/3 pixel farther/closer.
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_far = dist + third_pixel;
+        const float3 dist_near = abs(dist - third_pixel);
+        const float3 w_mid = exp(-pow(abs(dist*inv_alpha), beta));
+        const float3 w_far = exp(-pow(abs(dist_far*inv_alpha), beta));
+        const float3 w_near = exp(-pow(abs(dist_near*inv_alpha), beta));
+        return norm/3.0 * (w_mid + w_far + w_near);
+    }
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    A scanline's light output over a given pixel.  Integrals
+    //              are used when beam_antialias_level > 1.5 (samples otherwise)
+    //              and beam_generalized_gaussian picks the distribution.
+    if(beam_antialias_level > 1.5)
+    {
+        if(beam_generalized_gaussian)
+        {
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {
+            return scanline_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+    else
+    {
+        if(beam_generalized_gaussian)
+        {
+            return scanline_generalized_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {
+            return scanline_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    const float3 weighted = mul(weights, float4x3(color0, color1, color2, color3));
+    return max(weighted, 0.0);  //  max() avoids artifacts from negative colors
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  beam_horiz_linear_rgb_weight is static, so we can branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are gamma-encoded RGB (pow needs a float3 exponent).
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+			// wtf fixme
+//			const float beam_horiz_linear_rgb_weight1 = 1.0;
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (color1)
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  color0, color1, color2, and color3, where color1 is just
+    //                  left of the output pixel.
+    //  Returns:    Return a horizontally interpolated texture lookup using 2-4
+    //              nearby texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  The texel at scanline_uv and its right neighbor are always sampled:
+    const float3 left1 = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 right1 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    //  The outer texel lookups can be skipped (left black) for Quilez:
+    float3 left2 = float3(0.0);
+    float3 right2 = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        left2 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        right2 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    //  Samples are passed on as-is, whether linear or gamma-encoded;
+    //  get_interpolated_linear_color() will handle the difference.
+    return get_interpolated_linear_color(left2, left1, right1, right2, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.
+    //  Snap left to the previous texel and measure distances to 2/4 texels:
+    const float2 texel_pos = tex_uv * tex_size;
+    //  under_half works around a rounding bug right at exact texel locations.
+    const float2 left_center =
+        floor(texel_pos - float2(under_half)) + float2(0.5);
+    const float2 snapped_texel = float2(left_center.x, texel_pos.y);
+    const float2 snapped_uv = snapped_texel * texture_size_inv;
+    const float frac_x = texel_pos.x - snapped_texel.x;
+    const float4 tap_dists = float4(1.0 + frac_x, frac_x,
+        1.0 - frac_x, 2.0 - frac_x);
+    //  Pick Quilez, Gaussian, or Lanczos2 weights based on beam_horiz_filter:
+    float4 raw_weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez (only the two inner texels get nonzero weights):
+        const float t = tap_dists.y;
+        const float fade = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
+        raw_weights = float4(0.0, 1.0 - fade, fade, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian:
+        float gauss_scale = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        raw_weights = exp(-(tap_dists*tap_dists)*gauss_scale);
+    }
+    else
+    {
+        //  Lanczos2:
+        const float4 pi_x = FIX_ZERO(tap_dists * pi);
+        raw_weights = 2.0 * sin(pi_x) * sin(pi_x * 0.5) /
+            (pi_x * pi_x);
+    }
+    //  Normalize so the weights sum to 1.0:
+    const float4 unit_weights = raw_weights/dot(raw_weights, float4(1.0));
+    //  Fetch and blend the texels along the scanline:
+    const float2 step_u = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        tex, snapped_uv, step_u, unit_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.
+    //  With beam_misconvergence, R/G/B are each sampled at their own u offset.
+    if(beam_misconvergence)
+    {
+        const float3 channel_offsets =
+            get_convergence_offsets_x_vector();
+        const float3 channel_offsets_u =
+            channel_offsets * texture_size_inv.xxx;
+        const float2 uv_red = tex_uv - float2(channel_offsets_u.r, 0.0);
+        const float2 uv_green = tex_uv - float2(channel_offsets_u.g, 0.0);
+        const float2 uv_blue = tex_uv - float2(channel_offsets_u.b, 0.0);
+        const float3 red_pass = sample_single_scanline_horizontal(
+            tex, uv_red, tex_size, texture_size_inv);
+        const float3 green_pass = sample_single_scanline_horizontal(
+            tex, uv_green, tex_size, texture_size_inv);
+        const float3 blue_pass = sample_single_scanline_horizontal(
+            tex, uv_blue, tex_size, texture_size_inv);
+        return float3(red_pass.r, green_pass.g, blue_pass.b);
+    }
+    else
+    {
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only consider even/odd scanlines every
+    //  other frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+	// wtf fixme
+//	const float interlace_bff1 = 1.0;
+    const float field_shift = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 texel_pos = tex_uv * tex_size;
+    //  under_half works around a rounding bug right at exact texel locations.
+    const float2 texel_index = floor(texel_pos - float2(under_half));
+    const float lines_past_field = fmod(
+        texel_index.y + field_shift, il_step_multiple.y);
+    const float2 line_index = texel_index - float2(0.0, lines_past_field);
+    //  Move to the texel center of the previous scanline in the current field:
+    const float2 line_center = line_index + float2(0.5);
+    const float2 line_uv = line_center * texture_size_inv;
+    //  Output the sample's distance from that scanline, in units of scanlines:
+    dist = (texel_pos.y - line_center.y)/il_step_multiple.y;
+    return line_uv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Guess whether the source is interlaced from its line count.
+    if(interlace_detect)
+    {
+        //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+        //  NTSC Emulators: Typically 224 or 240 lines
+        //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+        //  PAL Emulators: ?
+        //  ATSC: 720p, 1080i, 1080p
+        //  Where do we place our cutoffs?  Assumptions:
+        //  1.) We only need to care about active lines.
+        //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+        //  3.) Anything > 576 lines is probably not interlaced...
+        //  4.) ...except 1080 lines, which is a crapshoot (user decision).
+        //  5.) Just in case the main program uses calculated video sizes,
+        //      we should nudge the float thresholds a bit.
+        const bool in_sd_range = (num_lines > 288.5) && (num_lines < 576.5);
+        const bool is_1080_lines =
+            (num_lines > 1079.5) && (num_lines < 1080.5);
+        const bool in_hd_range = bool(interlace_1080i) && is_1080_lines;
+        return (in_sd_range || in_hd_range);
+    }
+    else
+    {
+        return false;
+    }
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+//#include "phosphor-mask-resizing.h"
+
+////////////////////////  BEGIN PHOSPHOR-MASK-RESIZING  ////////////////////////
+
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+//#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what the drivers are assumed to allow:
+//  Dynamic loops not allowed: Use a flat static loop.
+//  Dynamic loops accommodated: Coarsely branch around static loops.
+//  Dynamic loops assumed allowed: Use a flat dynamic loop.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  Derive the min allowed tile size; the expected min simply aliases it here:
+static const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+static const float mask_min_expected_tile_size = 
+        mask_min_allowed_tile_size;
+//  Lanczos window scale, and the sinc tap limit at maximum minification:
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+static const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   Global constants mask_sinc_lobes and
+    //              max_sinc_resize_samples_m4 must be defined.
+    //  Returns:    The number of texture samples (a multiple of 4) needed to
+    //              downsize correctly at magnification_scale, limited by the
+    //              sample cap of the active codepath.
+    //  First pick the cap: the global maximum, further limited to 128 samples
+    //  when loops are simulated with branches.
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float sample_cap_m4 = max_sinc_resize_samples_m4;
+    #else   // ifdef BREAK_LOOPS_INTO_PIECES
+        const float sample_cap_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    //  When downsizing, the filter spans 2*lobes output pixels (not input
+    //  texels), so the input sample count grows as the scale shrinks.
+    const float needed_samples = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float needed_samples_m4 = 4.0 * ceil(0.25 * needed_samples);
+    return min(needed_samples_m4, sample_cap_m4);
+}
+
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, 
+    const float2 tex_size, const float dr, 
+    const float input_tiles_per_texture_r, const float samples,
+    static const bool vertical)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y (whichever direction
+    //                  we're resampling in); a scalar saves register space.
+    //              2.) input_tiles_per_texture_r is the number of input tiles
+    //                  that can fit in the input texture in the direction we're
+    //                  resampling this pass.
+    //              3.) samples is the total sample count used for this resize.
+    //              4.) vertical indicates whether we're resampling vertically
+    //                  this pass (or horizontally).
+    //  Returns:    Pack and return the first sample's tile_uv coord (in x) and
+    //              its texel distance from the destination pixel (in y), in
+    //              the resized dimension only.
+    //  We'll start with the topmost or leftmost sample and work down or right,
+    //  so get the first sample location and distance.  Modify both dimensions
+    //  as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    //  (and incorrect) dimension at the end.
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);
+    const float2 first_texel_uv_wrap_2D = first_texel * dr;
+    const float2 first_texel_dist_2D = curr_texel - first_texel;
+    //  Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const float2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    //  Project wrapped coordinates to the [0, 1] range.  We'll do this with all
+    //  samples, but the first texel is special, since it might be negative.
+    const float2 coord_negative =
+        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));
+    const float2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
+    //  Pack the first texel's tile_uv coord and texel distance in 1D:
+    const float2 tile_u_and_dist =
+        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const float2 tile_v_and_dist =
+        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
+{
+    //  Mipmapping and anisotropic filtering get confused by sinc-resampling,
+    //  so optionally force mip level 0 (slow) or apply a large negative bias:
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return textureLod(tex, tex_uv, 0.0);    //  explicit LOD argument
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites, and they read and
+//  write locals of the expanding function (i, i_base, tex, tex_uv, tile_dr,
+//  pixel_color, weight_sum, etc.).  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and its sinc weight,
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      or the slower/smoother Lanczos: L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weights in weight_sum, and accumulate the weighted
+//      texels in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord.  tex_uv_r will contain the tex_uv u or v coord (tile_uv
+    //  scaled by tile_size_uv_r) for four new texel samples.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the uv size of an input tile (the
+    //                  fraction of the input texture it spans) in the
+    //                  direction we're resampling this pass.
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    static const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  The loop
+    //  body macros above take i_base + i as the index of their first sample.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Pass it through to get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        static const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  The loop
+    //  body macros above take i_base + i as the index of their first sample.
+    int i_base = 0;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
+    //  lerp() acts as a select (mask_specify_num_triads is 0.0/1.0).  If it's
+    //  1.0 and estimated_viewport_size.x is wrong, the result will be too:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Requires:   1.) Requirements of get_resized_mask_tile_size() must be
+    //                  met, particularly regarding global constants.
+    //              The function parameters must be defined as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    //  Returns:    Return a float4 containing:
+    //                  xy: tex_uv start of the mask tile (0.0 if mode >= 0.5)
+    //                  zw: tex_uv size of the mask tile (1.0 if mode >= 0.5)
+    //              mask_tiles_per_screen is an out parameter containing the
+    //              number of mask tiles that will fit on the screen.
+    //  First get the final resized tile size.  The viewport size and mask
+    //  resize viewport scale must be correct, but don't solemnly swear they
+    //  were correct in both mask resize passes unless you know it's true.
+    //  (We can better ensure a correct tile aspect ratio if the parameters are
+    //  guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    //  sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(mask_sample_mode < 0.5)
+    {
+        //  Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        //  size and starts at a nonzero offset to allow for border texels:
+        const float2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
+        const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        //  mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        //  If we're tiling at the original size (1:1 pixel:texel), redefine a
+        //  "tile" to be the full texture containing many triads.  Otherwise,
+        //  we're hardware-resampling an LUT, and the texture truly contains a
+        //  single unresized phosphor mask tile anyway.
+        static const float2 mask_tile_uv_size = float2(1.0);
+        static const float2 mask_tile_start_uv = float2(0.0);
+        if(mask_sample_mode > 1.5)
+        {
+            //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else
+        {
+            //  Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
+{
+    //  Map wrapping tile-relative mask coords to tex_uv coords for sampling.
+    //  Requires:   1.) tile_uv_wrap contains tile-relative uv coords in
+    //                  [0, inf): one tile spans [0, 1] with (0.5, 0.5) at its
+    //                  center, and fractional parts map to a repeated tile.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
+    //                  for the start of the embedded tile in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw contains the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    tex_uv coords (used for texture sampling) for tile_uv_wrap;
+    //              tile_uv_wrap itself unless get_mask_sample_mode() < 0.5.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        //  Manually repeat the resized mask tile to fill the screen:
+        //  First get fractional tile_uv coords.  Using frac/fmod on coords
+        //  confuses anisotropic filtering; fix it as user options dictate.
+        //  derived-settings-and-constants.h disables incompatible options.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;  //  wraps every 2 tiles, so tile_uv is in [0, 2)
+        #else
+            float2 tile_uv = frac(tile_uv_wrap);  //  wraps every tile, so tile_uv is in [0, 1)
+        #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            const float2 tile_uv_dx = ddx(tile_uv);
+            const float2 tile_uv_dy = ddy(tile_uv);
+            tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
+                tile_uv_dx, tile_uv_dy);
+        #endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES; NOTE(review): the helper called here looks commented out above - confirm this macro stays undefined in this file
+        //  The tile is embedded in a padded FBO, and it may start at a
+        //  nonzero offset if border texels are used to avoid artifacts:
+        const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+            tile_uv * mask_tile_start_uv_and_size.zw;
+        return mask_tex_uv;
+    }
+    else
+    {
+        //  Sample from the input phosphor mask texture with hardware tiling.
+        //  If we're tiling at the original size (mode 2), the "tile" is the
+        //  whole texture, and it contains a large number of triads mapped with
+        //  a 1:1 pixel:texel ratio.  OTHERWISE, the texture contains a single
+        //  unresized tile.  tile_uv_wrap already has correct coords for both!
+        return tile_uv_wrap;
+    }
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////  END PHOSPHOR-MASK-RESIZING  /////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+// already got it
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 tex2Dtiled_mask_linearize(const sampler2D tex,
+    const float2 tex_uv)
+{
+    //  Sample tex at tex_uv via a *_linearize helper chosen at compile time.  If we're manually tiling a
+    //  texture, anisotropic filtering can get confused; one workaround is to just select the lowest mip level:
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+            //  TODO: Use tex2Dlod_linearize with a calculated mip level.
+            return tex2Dlod_linearize(tex, float4(tex_uv, 0.0, 0.0));  //  presumably .w is an explicit LOD of 0 - confirm against the helper
+        #else
+            #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+                return tex2Dbias_linearize(tex, float4(tex_uv, 0.0, -16.0));  //  presumably .w is a LOD bias - confirm against the helper
+            #else
+                return tex2D_linearize(tex, tex_uv);  //  neither workaround macro is defined
+            #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #else  //  PHOSPHOR_MASK_MANUALLY_RESIZE is not defined
+        return tex2D_linearize(tex, tex_uv);
+    #endif  //  PHOSPHOR_MASK_MANUALLY_RESIZE
+}
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+// END VERTEX INCLUDES //
+
+float bloom_approx_scale_x = targetSize.x / sourceSize[0].y;  //  NOTE(review): divides target width by source height (sourceSize[0].y) - confirm the .y is intended
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+//  Entry point of this stage: writes gl_Position, vTexCoord, the per-input uv coords and mask_tile_start_uv_and_size.
+void main() {
+   gl_Position = position;
+   vTexCoord = texCoord * 1.00001;  //  NOTE(review): the 1.00001 factor is unexplained here; presumably a precision nudge - confirm
+   
+   float2 tex_uv = vTexCoord.xy;
+	//  Our various input textures use different coords: video_uv is rescaled by each input's video_size/texture_size.
+    video_uv = tex_uv * texture_size/video_size;
+    scanline_texture_size_inv =
+        float2(1.0, 1.0)/VERTICAL_SCANLINEStexture_size;
+    //video_uv = video_uv;  (commented-out self-assignment, kept as found)
+    scanline_tex_uv = video_uv * VERTICAL_SCANLINESvideo_size *
+        scanline_texture_size_inv;
+    blur3x3_tex_uv = video_uv * BLOOM_APPROXvideo_size /
+        BLOOM_APPROXtexture_size;
+    halation_tex_uv = video_uv * HALATION_BLURvideo_size /
+        HALATION_BLURtexture_size;
+    //scanline_texture_size_inv = scanline_texture_size_inv;  (commented-out self-assignment, kept as found)
+
+    //  Get a consistent name for the final mask texture size.  Sample mode 0
+    //  uses the manually resized mask, but ignore it if we never resized.
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        const float mask_sample_mode = get_mask_sample_mode();
+        const float2 mask_resize_texture_size = mask_sample_mode < 0.5 ?
+            MASK_RESIZEtexture_size : mask_texture_large_size;
+        const float2 mask_resize_video_size = mask_sample_mode < 0.5 ?
+            MASK_RESIZEvideo_size : mask_texture_large_size;
+    #else  //  never resized: both sizes fall back to mask_texture_large_size
+        const float2 mask_resize_texture_size = mask_texture_large_size;
+        const float2 mask_resize_video_size = mask_texture_large_size;
+    #endif  //  PHOSPHOR_MASK_MANUALLY_RESIZE
+    //  Compute mask tile dimensions, starting points, etc.:
+    //float2 mask_tiles_per_screen;  (local declaration left commented out; the name used below is declared outside main)
+    mask_tile_start_uv_and_size = get_mask_sampling_parameters(
+        mask_resize_texture_size, mask_resize_video_size, output_size,
+        mask_tiles_per_screen);
+    //mask_tiles_per_screen = mask_tiles_per_screen;  (commented-out self-assignment, kept as found)
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.fs b/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.fs
new file mode 100644
index 00000000..d090c529
--- /dev/null
+++ b/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.fs
@@ -0,0 +1,5963 @@
+#version 150
+
+uniform sampler2D source[];   //  source[0] is aliased as input_texture below
+uniform vec4 sourceSize[];    //  sourceSize[0].xy is aliased as both texture_size and video_size below
+uniform vec4 targetSize;      //  targetSize.xy is aliased as output_size below
+uniform int phase;            //  aliased as frame_count below
+
+in Vertex {                   //  inputs to this stage
+   vec2 vTexCoord;
+   vec2 uv_step;
+   vec2 il_step_multiple;
+   float pixel_height_in_scanlines;
+};
+
+out vec4 FragColor;           //  the only output declared by this shader
+
+// USER SETTINGS BLOCK //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 0.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// Compatibility macros for transparently converting HLSLisms into GLSLisms (function-like and type/name aliases).
+#define mul(a,b) (b*a)  // NOTE(review): operands are reversed and not parenthesized in the expansion - confirm call sites only pass simple operands
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy  // same expansion as texture_size in this file
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  // NOTE(review): the two arguments are exchanged in the expansion - confirm against HLSL atan2(y, x) vs GLSL atan(y, x)
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; set to the 0.5 default (an earlier note here mentioned using 1.0 because 0.5 looked too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels.  NOTE(review): the x offset is added with its sign, so a negative offset shrinks the border -- confirm abs() isn't intended:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bounding
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5 (an earlier note said 1.0 was used to avoid a too-dark image -- TODO confirm intended value)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifndef PHOSPHOR_MASK_GRILLE14  //  default: 15-pixel grille stripes
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#else                           //  14-pixel grille stripes
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; abs() drops the sign of c
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Integrated GPUs such as the Intel HD 4000 can't (for now) cope with manual
+//  mask resizing, a geometry mode chosen at runtime, or the 4x4 true Gaussian
+//  resize, so strip those options out as early as possible.  The trigger,
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE, may be #defined by user-settings.h
+//  or by a wrapper .cg that #includes the current .cg pass.
+#ifndef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#else
+    //  Filter mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur)
+    //  is inferior in most cases, so substitute 0.0 for 2.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE    //  static parameters only
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  ifndef RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES   //  no derivatives: drop strategy c
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  ifndef DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD      //  no tex2Dlod: drop everything built on it
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  ifndef DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS     //  no tex2Dbias: drop the backups as well
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ifndef DRIVERS_ALLOW_TEX2DBIAS
+//  Keep only the highest-priority anisotropic tiling strategy (ranked by
+//  performance) and #undef the rest, so all the nesting lives in one place.
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifndef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #else
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS  //  never still set alongside TEX2DLOD here
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD   //  tex2Dlod outranks tex2Dbias
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  Resize from the large mipmapped LUT whenever that works without mipmapping
+//  artifacts (it leaves more room to use fewer samples); else the small LUT.
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#else
+    #ifndef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #else
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #endif
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES  //  slower path: opt-in only
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#else                                   //  always on with dynamic branches
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#endif
+
+//  The resize passes must render some minimum number of mask tiles: at least
+//  1.0 just to repeat a single tile, plus extra padding for anisotropic
+//  filtering, discontinuity fixing, antialiasing, same-pass curvature (not
+//  currently used), etc.  Start by working out how many border texels and
+//  tiles are needed, which depends on how the result will be sampled.
+//  Only GEOMETRY_EARLY adds an antialiasing border; otherwise it is 0.0:
+#ifndef GEOMETRY_EARLY
+    static const float max_aa_base_pixel_border = 0.0;
+#else
+    static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+    //  Most antialiasing filters have a base radius of 4.0 pixels:
+    static const float max_aa_base_pixel_border = 4.0 +
+        max_subpixel_offset;
+#endif
+//  Outside the tex2Dlod/tex2Dbias family, anisotropic filtering adds ~0.5:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#endif
+//  Fixing discontinuities costs 1.0 more pixel of border:
+#ifndef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#endif
+//  Round the pixel border up to a whole-texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifndef GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#else
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#endif
+//  Express the texel border in tiles under worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Lastly, choose how many resized tiles are rendered to MASK_RESIZE and the
+//  first texel (just inside any border) from which it will be sampled.
+#ifdef GEOMETRY_EARLY
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#else
+    #ifndef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #else
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #endif
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Cap the requested ratio at geom_max_aspect_ratio, then normalize the
+    //  (ratio, 1.0) pair so that only its direction is returned and the
+    //  absolute scale can't affect the uv-mapping for curvature.
+    //  Returns: a unit-length float2 aspect vector.
+    const float capped_aspect_ratio =
+        min(geom_aspect_ratio, geom_max_aspect_ratio);
+    return normalize(float2(capped_aspect_ratio, 1.0));
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Pack the scalar geom_overscan_x/y settings into one (x, y) vector.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Pack the scalar geom_tilt_angle_x/y settings into one (x, y) vector.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  x-direction convergence offsets, ordered (r, g, b).
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  y-direction convergence offsets, ordered (r, g, b).
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Convergence offset of the "r" channel as an (x, y) vector.
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Convergence offset of the "g" channel as an (x, y) vector.
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Convergence offset of the "b" channel as an (x, y) vector.
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS   //  static weights: static offset too
+        return aa_subpixel_r_offset_static;
+    #else
+        #ifndef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            return aa_subpixel_r_offset_static;
+        #else
+            //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #endif
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Each gain is the reciprocal of that mask's average color; pick by type.
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    const float slot_or_shadow_gain = mask_type < 1.5 ? slot_gain : shadow_gain;
+    return mask_type < 0.5 ? grille_gain : slot_or_shadow_gain;
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Start from the runtime request when runtime mode/type selection is
+    //  compiled in; otherwise start from the static setting:
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        const float requested_mode = mask_sample_mode_desired;
+    #else
+        const float requested_mode = mask_sample_mode_static;
+    #endif
+    //  Without manual resizing, confine the mode to the range [1.0, 2.0]:
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default (a prior note found it too dark and suggested 1.0)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default is 0.5 (a prior note said this was set to 1.0 because 0.5 looked unnecessarily dark, but the value here is 0.5)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. re-add horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else  //  15-pixel stripes (default)
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; NOTE: the sign of c is dropped
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif  //  SIMULATE_CRT_ON_LCD
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 (anything above 1.5) with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else  //  No compatibility mode: use the static setting unchanged.
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif  //  !DRIVERS_ALLOW_DERIVATIVES
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DLOD
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  !DRIVERS_ALLOW_TEX2DBIAS
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else  //  The tex2Dlod tiling strategy is not #defined at this point.
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else  //  Neither the tex2Dlod nor the tex2Dbias tiling strategy remains.
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else  //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else  //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else  //  No dynamic branches: only enable runtime selection when forced.
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else  //  !GEOMETRY_EARLY
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else  //  !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif  //  ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else  //  !GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif  //  GEOMETRY_EARLY
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else  //  One tile plus a border of max_mask_tile_border on each side.
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else  //  GEOMETRY_EARLY: always one tile plus a border on each side.
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPUs
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    An Abramowitz/Stegun approximation of erf(), where:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              It's evaluated on abs(x), then the sign of x is restored.
+    //              Stated max absolute error: 2.5*10**-5.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+    static const float4 one = float4(1.0);
+    const float4 sign_x = sign(x);
+    const float4 t = one/(one + 0.47047*abs(x));
+    const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+        exp(-(x*x));
+    return result * sign_x;
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload of the Abramowitz/Stegun erf() approximation:
+    static const float3 unit = float3(1.0);
+    const float3 u = unit/(unit + 0.47047*abs(x));
+    const float3 poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float3 gauss = exp(-(x*x));
+    const float3 magnitude = unit - poly*gauss;
+    return magnitude * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload of the Abramowitz/Stegun erf() approximation:
+    static const float2 unit = float2(1.0);
+    const float2 u = unit/(unit + 0.47047*abs(x));
+    const float2 poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float2 gauss = exp(-(x*x));
+    const float2 magnitude = unit - poly*gauss;
+    return magnitude * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload of the Abramowitz/Stegun erf() approximation:
+    const float u = 1.0/(1.0 + 0.47047*abs(x));
+    const float poly = u*(0.3480242 + u*(-0.0958798 + u*0.7478556));
+    const float gauss = exp(-(x*x));
+    const float magnitude = 1.0 - poly*gauss;
+    return magnitude * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf() approximated as tanh(1.202760580 * x).  The original
+    //              author calls the error visually noticeable but fast; see:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Requires a correct driver tanh(); the author reports garbage
+    //              output from tanh() on an nVidia 8800GTS.
+    const float4 scaled_x = 1.202760580 * x;
+    return tanh(scaled_x);
+}
+
+float3 erft(const float3 x)
+{
+    const float3 scaled_x = 1.202760580 * x;    //  Float3 tanh()-based erf()
+    return tanh(scaled_x);
+}
+
+float2 erft(const float2 x)
+{
+    const float2 scaled_x = 1.202760580 * x;    //  Float2 tanh()-based erf()
+    return tanh(scaled_x);
+}
+
+float erft(const float x)
+{
+    const float scaled_x = 1.202760580 * x;     //  Scalar tanh()-based erf()
+    return tanh(scaled_x);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the standard parameter to erf().
+    //  Returns:    erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is #defined.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 overload: erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is set.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 overload: erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is set.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload: erf6(x), or erft(x) if ERF_FAST_APPROXIMATION is set.
+    #ifndef ERF_FAST_APPROXIMATION
+        return erf6(x);
+    #else
+        return erft(x);
+    #endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the gamma function's argument; the original author
+    //                  tuned this approximation for s in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller precomputes it so the value
+    //                  can be reused outside this function.
+    //  Returns:    An approximation of gamma(s) (the real-numbered factorial)
+    //              from a Lanczos approximation with two coefficients, which
+    //              the author derived using Paul Godfrey's method:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              Author's notes: g ~= 1.12906830989 is optimal for s in
+    //              [0, 36], with a maximum relative error of 0.000463 over
+    //              2**16 equally spaced evals.  Three coeffs (0.0000346 error)
+    //              wouldn't hurt latency, but two allow more parallelism with
+    //              outside instructions.
+    //  Note:       Every operation below is componentwise.
+    static const float4 lanczos_g = float4(1.12906830989);
+    static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+    static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+    static const float4 euler = float4(2.71828182845904523536028747135266249775724709);
+    const float4 series = coeff0 + coeff1/(s + float4(1.0));
+    const float4 exponent = s + float4(0.5);
+    const float4 pow_base = (exponent + lanczos_g)/euler;
+    //  pow(pow_base, exponent) * series approximates gamma(s + 1); multiplying
+    //  by s_inv yields gamma(s), with less error for small s than (s -= 1.0).
+    return (pow(pow_base, exponent) * series) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+    //  Float3 overload of the two-coefficient Lanczos approximation:
+    static const float3 lanczos_g = float3(1.12906830989);
+    static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+    static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+    static const float3 euler = float3(2.71828182845904523536028747135266249775724709);
+    const float3 series = coeff0 + coeff1/(s + float3(1.0));
+    const float3 exponent = s + float3(0.5);
+    const float3 pow_base = (exponent + lanczos_g)/euler;
+    return (pow(pow_base, exponent) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+    //  Float2 overload of the two-coefficient Lanczos approximation:
+    static const float2 lanczos_g = float2(1.12906830989);
+    static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+    static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+    static const float2 euler = float2(2.71828182845904523536028747135266249775724709);
+    const float2 series = coeff0 + coeff1/(s + float2(1.0));
+    const float2 exponent = s + float2(0.5);
+    const float2 pow_base = (exponent + lanczos_g)/euler;
+    return (pow(pow_base, exponent) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+    //  Scalar overload of the two-coefficient Lanczos approximation:
+    static const float lanczos_g = 1.12906830989;
+    static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+    static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+    static const float euler = 2.71828182845904523536028747135266249775724709;
+    const float series = coeff0 + coeff1/(s + 1.0);
+    const float exponent = s + 0.5;
+    const float pow_base = (exponent + lanczos_g)/euler;
+    return (pow(pow_base, exponent) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the gamma function's argument (intended range [0, 36]).
+    //  Returns:    Approximate gamma(s) via gamma_impl(), whose comments cite
+    //              a maximum relative error of 0.000463.
+    const float4 s_inv = float4(1.0)/s;
+    return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)
+{
+    const float3 s_inv = float3(1.0)/s;     //  Float3 overload of gamma()
+    return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)
+{
+    const float2 s_inv = float2(1.0)/s;     //  Float2 overload of gamma()
+    return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)
+{
+    const float s_inv = 1.0/s;              //  Scalar overload of gamma()
+    return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+	//  Requires:   1.) s < ~0.5
+	//              2.) z <= ~0.775075
+	//              3.) s_inv = 1.0/s (precomputed so callers can reuse it)
+	//  Returns:    A series representation for the lower incomplete gamma
+	//              function for small s and small z, truncated to the four
+	//              terms k = 0..3 of
+	//                  pow(z, s) * sum_k (-1)^k * z^k / (k! * (s + k))
+	//  The factorial is folded into each denominator, so term k divides by
+	//  k!*s + k!*k, i.e. (s + 1), (2s + 4) and (6s + 18) for k = 1, 2, 3.
+	//  The k = 0 term is simply 1/s, which is why s_inv is passed in.
+	//  A k = 4 term, z^4/(24s + 96), was left disabled in the original code
+	//  and is not evaluated here either.
+	const float4 z_squared = z*z;
+	const float4 z_cubed = z * z_squared;
+	const float4 k1_denom = s + float4(1.0);
+	const float4 k2_denom = 2.0*s + float4(4.0);
+	const float4 k3_denom = 6.0*s + float4(18.0);
+	//  Accumulate the alternating terms in order k = 0, 1, 2, 3:
+	float4 series = s_inv;
+	series -= z/k1_denom;
+	series += z_squared/k2_denom;
+	series -= z_cubed/k3_denom;
+	//  Apply the common pow(z, s) factor:
+	return pow(z, s) * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+	//  Float3 overload: four-term alternating series (k = 0..3) of
+	//      pow(z, s) * sum_k (-1)^k * z^k / (k! * (s + k))
+	//  with s_inv = 1.0/s supplied by the caller as the k = 0 term.
+	const float3 z_squared = z*z;
+	const float3 z_cubed = z * z_squared;
+	const float3 k1_denom = s + float3(1.0);
+	const float3 k2_denom = 2.0*s + float3(4.0);
+	const float3 k3_denom = 6.0*s + float3(18.0);
+	float3 series = s_inv;
+	series -= z/k1_denom;
+	series += z_squared/k2_denom;
+	series -= z_cubed/k3_denom;
+	return pow(z, s) * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+	//  Float2 overload: four-term alternating series (k = 0..3) of
+	//      pow(z, s) * sum_k (-1)^k * z^k / (k! * (s + k))
+	//  with s_inv = 1.0/s supplied by the caller as the k = 0 term.
+	const float2 z_squared = z*z;
+	const float2 z_cubed = z * z_squared;
+	const float2 k1_denom = s + float2(1.0);
+	const float2 k2_denom = 2.0*s + float2(4.0);
+	const float2 k3_denom = 6.0*s + float2(18.0);
+	float2 series = s_inv;
+	series -= z/k1_denom;
+	series += z_squared/k2_denom;
+	series -= z_cubed/k3_denom;
+	return pow(z, s) * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+	//  Scalar overload: four-term alternating series (k = 0..3) of
+	//      pow(z, s) * sum_k (-1)^k * z^k / (k! * (s + k))
+	//  with s_inv = 1.0/s supplied by the caller as the k = 0 term.
+	const float z_squared = z*z;
+	const float z_cubed = z * z_squared;
+	const float k1_denom = s + 1.0;
+	const float k2_denom = 2.0*s + 4.0;
+	const float k3_denom = 6.0*s + 18.0;
+	float series = s_inv;
+	series -= z/k1_denom;
+	series += z_squared/k2_denom;
+	series -= z_cubed/k3_denom;
+	return pow(z, s) * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+	//  Requires:   1.) s < ~0.5
+	//              2.) z > ~0.775075
+	//  Returns:    Gauss's continued fraction representation for the upper
+	//              incomplete gamma function, truncated to 4 levels:
+	//                  pow(z, s) * exp(-z) / D1
+	//              where, for i = 1..4,
+	//                  Di = (2i - 1) + z - s + i*(s - i)/D(i+1)
+	//              and the level below D4 is treated as infinite, so that
+	//              D4 = 7 + z - s.
+	//  The levels are evaluated from the innermost (D4) outwards, with the
+	//  products i*(s - i) pre-expanded to (3s - 9), (2s - 4) and (s - 1):
+	float4 tail = float4(7.0) + z - s;
+	tail = float4(5.0) + z - s + (3.0*s - float4(9.0))/tail;
+	tail = float4(3.0) + z - s + (2.0*s - float4(4.0))/tail;
+	tail = float4(1.0) + z - s + (s - float4(1.0))/tail;
+	const float4 prefactor = pow(z, s) * exp(-z);
+	return prefactor / tail;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+	//  Float3 overload: pow(z, s) * exp(-z) divided by a 4-level truncated
+	//  continued fraction, evaluated from the innermost level outwards.
+	float3 tail = float3(7.0) + z - s;
+	tail = float3(5.0) + z - s + (3.0*s - float3(9.0))/tail;
+	tail = float3(3.0) + z - s + (2.0*s - float3(4.0))/tail;
+	tail = float3(1.0) + z - s + (s - float3(1.0))/tail;
+	const float3 prefactor = pow(z, s) * exp(-z);
+	return prefactor / tail;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+	//  Float2 overload: pow(z, s) * exp(-z) divided by a 4-level truncated
+	//  continued fraction, evaluated from the innermost level outwards.
+	float2 tail = float2(7.0) + z - s;
+	tail = float2(5.0) + z - s + (3.0*s - float2(9.0))/tail;
+	tail = float2(3.0) + z - s + (2.0*s - float2(4.0))/tail;
+	tail = float2(1.0) + z - s + (s - float2(1.0))/tail;
+	const float2 prefactor = pow(z, s) * exp(-z);
+	return prefactor / tail;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+	//  Scalar overload: pow(z, s) * exp(-z) divided by a 4-level truncated
+	//  continued fraction, evaluated from the innermost level outwards.
+	float tail = 7.0 + z - s;
+	tail = 5.0 + z - s + (3.0*s - 9.0)/tail;
+	tail = 3.0 + z - s + (2.0*s - 4.0)/tail;
+	tail = 1.0 + z - s + (s - 1.0)/tail;
+	const float prefactor = pow(z, s) * exp(-z);
+	return prefactor / tail;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+	//  Requires:   1.) s < ~0.5
+	//              2.) s_inv = 1/s (precomputed for outside reuse)
+	//              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
+	//  Returns:    Approximate the normalized lower incomplete gamma function
+	//              for s < 0.5.  Because only s < 0.5 matters, two branches
+	//              selected by z are enough (not four).  Each branch uses four
+	//              terms, with a max relative error of ~0.00182.  The branch
+	//              threshold and specifics were adapted for fewer terms from
+	//              Gil/Segura/Temme's paper here:
+	//                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+	//  Both branches are always evaluated and then blended with 0/1 weights:
+	//  real branches test slower even when available.
+	static const float4 z_cutoff = float4(0.775075);
+	//  z <= cutoff: series for the lower function, normalized by 1/gamma(s).
+	const float4 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	//  z > cutoff: one minus the normalized upper function.
+	const float4 from_fraction = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	//  Per-component selection masks:
+	bool4 use_fraction;
+	use_fraction.x = z.x > z_cutoff.x;
+	use_fraction.y = z.y > z_cutoff.y;
+	use_fraction.z = z.z > z_cutoff.z;
+	use_fraction.w = z.w > z_cutoff.w;
+	bool4 use_series = not(use_fraction);
+	return from_fraction * float4(use_fraction) + from_series * float4(use_series);
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+	//  Float3 overload: evaluates both the small-z series and the large-z
+	//  continued fraction, then keeps one of them per component depending on
+	//  whether z exceeds the cutoff.
+	static const float3 z_cutoff = float3(0.775075);
+	const float3 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	const float3 from_fraction = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	bool3 use_fraction;
+	use_fraction.x = z.x > z_cutoff.x;
+	use_fraction.y = z.y > z_cutoff.y;
+	use_fraction.z = z.z > z_cutoff.z;
+	bool3 use_series = not(use_fraction);
+	return from_fraction * float3(use_fraction) + from_series * float3(use_series);
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+	//  Float2 overload: evaluates both the small-z series and the large-z
+	//  continued fraction, then keeps one of them per component depending on
+	//  whether z exceeds the cutoff.
+	static const float2 z_cutoff = float2(0.775075);
+	const float2 from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	const float2 from_fraction = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	bool2 use_fraction;
+	use_fraction.x = z.x > z_cutoff.x;
+	use_fraction.y = z.y > z_cutoff.y;
+	bool2 use_series = not(use_fraction);
+	return from_fraction * float2(use_fraction) + from_series * float2(use_series);
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+	//  Scalar overload: evaluates both the small-z series and the large-z
+	//  continued fraction, then keeps one of them depending on whether z
+	//  exceeds the cutoff.
+	static const float z_cutoff = 0.775075;
+	const float from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	const float from_fraction = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const bool use_fraction = z > z_cutoff;
+	return from_fraction * float(use_fraction) + from_series * float(!use_fraction);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+	//  Requires:   s < ~0.5
+	//  Returns:    Approximate the normalized lower incomplete gamma function
+	//              for s < 0.5.  This wrapper derives the two reciprocals that
+	//              normalized_ligamma_impl() expects (1/s and 1/gamma(s)) and
+	//              delegates to it; see that function for details.
+	const float4 recip_s = float4(1.0)/s;
+	//  gamma_impl() is called directly so the reciprocal of s is reused:
+	const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+	//  Float3 overload: derives 1/s and 1/gamma(s), then delegates to
+	//  normalized_ligamma_impl().
+	const float3 recip_s = float3(1.0)/s;
+	const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+	//  Float2 overload: derives 1/s and 1/gamma(s), then delegates to
+	//  normalized_ligamma_impl().
+	const float2 recip_s = float2(1.0)/s;
+	const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+	//  Scalar overload: derives 1/s and 1/gamma(s), then delegates to
+	//  normalized_ligamma_impl().
+	const float recip_s = 1.0/s;
+	const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Encoding gammas of the video standards:
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    //  Decoding gammas of typical display devices (only use these when
+    //  emulating a device).  Background:
+    //  - NTSC and Rec.709 deliberately undercorrected for the 2.5 gamma an
+    //    analog CRT reference display was assumed to have, to keep contrast
+    //    in the assumed [dark] viewing conditions, so the reference values
+    //    are higher than the standards' own gammas:
+    //    http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  - Those unstated assumptions about display gamma and perceptual
+    //    rendering intent caused a lot of confusion, and more modern CRT's
+    //    seemed to target NTSC 2.2 gamma with circuitry.
+    //  - LCD displays seem to have followed suit (they struggle near black
+    //    with 2.5 gamma anyway), especially PC/laptop displays designed to
+    //    view sRGB in bright environments.
+    //  - Standards are in flux again with BT.1886, but it's underspecified
+    //    for displays.
+    //  "Office" values, i.e. displays adjusted towards 2.2:
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    //  "Reference" values:
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    //  Default used unless the including file defines
+    //  OVERRIDE_ALPHA_ASSUMPTIONS and supplies its own value.  With false,
+    //  encode_output(), decode_input() and decode_gamma_input() pass color.a
+    //  through unchanged; with true they write 1.0 into alpha instead.
+    static const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifndef OVERRIDE_DEVICE_GAMMA
+    //  No override requested: fall back to the standard constants.
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+#else
+    //  Override requested: crt_gamma, gba_gamma and lcd_gamma must already be
+    //  defined globally by the including file.
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifndef OVERRIDE_FINAL_GAMMA
+    //  When every pass is gamma-encoded, ntsc_gamma is always used between
+    //  passes so that middle passes never need to know what is simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    //  Pick the input/output pair for the first simulation macro that is
+    //  defined, checked in this fixed order of precedence:
+    #if defined(SIMULATE_CRT_ON_LCD)
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Nothing is simulated:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_* selection
+#else
+    //  Override requested: intermediate_gamma, input_gamma and output_gamma
+    //  must already be defined globally by the including file.  None of the
+    //  SIMULATE_* macros is consulted in this case.
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifdef GAMMA_ENCODE_EVERY_FBO
+    //  Every pass decodes its input and encodes its output.  Only the first
+    //  input and the last output use the "final" gammas; everything between
+    //  passes uses the intermediate gamma.
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifndef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #else
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #endif
+    #ifndef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #else
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #endif
+#else
+    //  Only the first pass decodes and only the last pass encodes; all other
+    //  passes report a gamma of 1.0 and leave colors untouched.
+    #ifndef FIRST_PASS
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #else
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #endif
+    #ifndef LAST_PASS
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #else
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #endif
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+//  True exactly when this pass does not decode its input in the shader
+//  (linearize_input is false); false whenever decode_input() applies pow().
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encodes a color for this pass's output.
+    //  Requires:   color is the value the fragment shader wants to write.
+    //  Returns:    color unchanged when gamma_encode_output is false.
+    //              Otherwise rgb is raised to 1.0/get_pass_output_gamma(), and
+    //              alpha is either copied from color or forced to 1.0,
+    //              depending on assume_opaque_alpha.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Gamma-decodes a color read from this pass's input.
+    //  Requires:   color is a texel as sampled by the caller.
+    //  Returns:    color unchanged when linearize_input is false.  Otherwise
+    //              rgb is raised to get_pass_input_gamma(), and alpha is
+    //              either copied from color or forced to 1.0, depending on
+    //              assume_opaque_alpha.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    float3 decoded_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    if(assume_opaque_alpha)
+    {
+        return float4(decoded_rgb, 1.0);
+    }
+    return float4(decoded_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Decodes a color with a caller-supplied per-channel gamma.  Unlike
+    //  decode_input(), this always applies pow(), regardless of
+    //  linearize_input.
+    //  Returns:    rgb raised to gamma; alpha is either copied from color or
+    //              forced to 1.0, depending on assume_opaque_alpha.
+    float3 decoded_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(decoded_rgb, 1.0);
+    }
+    return float4(decoded_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D:
+//  Samples tex at tex_coords through the COMPAT_TEXTURE macro (not defined in
+//  this section of the file) and returns the texel passed through
+//  decode_input().  tex_coords is not const-qualified, presumably because of
+//  the blur-offset issue described in the FIXME note earlier in this file.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+//  Float3 overload: only tex_coords.xy is used for the lookup; the third
+//  component is ignored.  The texel is passed through decode_input().
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+
+//  Overload with an extra integer argument.  texel_off is forwarded as the
+//  third argument of textureLod(), and the texel is passed through
+//  decode_input().
+//  NOTE(review): the third argument of GLSL textureLod() is a level of detail,
+//  while the parameter name suggests a texel offset -- confirm which meaning
+//  callers rely on before using this overload.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+
+//  Float3 overload with an extra integer argument.  Only tex_coords.xy is
+//  used; texel_off is forwarded as the third argument of textureLod(), and the
+//  texel is passed through decode_input().
+//  NOTE(review): the third argument of GLSL textureLod() is a level of detail,
+//  while the parameter name suggests a texel offset -- confirm which meaning
+//  callers rely on before using this overload.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod:
+//  Samples tex at tex_coords.xy with textureLod() at a fixed level of 0.0 and
+//  passes the texel through decode_input().  tex_coords.zw is not read, so no
+//  level of detail can be requested through the coordinate vector.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+
+//  Overload with an extra integer argument.  Samples tex at tex_coords.xy and
+//  forwards texel_off as the third argument of textureLod(); tex_coords.zw is
+//  not read.  The texel is passed through decode_input().
+//  NOTE(review): the third argument of GLSL textureLod() is a level of detail,
+//  while the parameter name suggests a texel offset -- confirm which meaning
+//  callers rely on before using this overload.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
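+/////////*  NOTE(review): this line does not close the block comment opened above "tex3D:"; everything down to the next comment terminator (after tex2Dfetch_linearize_gamma) is disabled, including tex2D_linearize_gamma.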
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: Only tex_coords.xy is used; this overload always reads LOD 0.0.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   float4 encoded = textureLod(tex, tex_coords.xy, 0.0); return decode_gamma_input(encoded, gamma);   }
+//  NOTE(review): texel_off is forwarded as the LOD argument below -- confirm.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   float4 encoded = textureLod(tex, tex_coords.xy, texel_off); return decode_gamma_input(encoded, gamma);   }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  holding the desired minimum and maximum beam standard
+    //                  deviations, for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; callers
+    //                  pass it in so it is not recomputed on every call when
+    //                  beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
+    //              inner f(color) subfunction (see below) as:
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The per-channel Gaussian beam standard deviation for color:
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube root balances
+    //  between variable width and a soft/realistic shape.  A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    float3 spot_shape;
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Power curve: f(color) = pow(color, beam_spot_power)
+        spot_shape = pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Spherical curve: f(color) = sqrt(1.0 - (color - 1.0)^2)
+        const float3 color_offset = color - float3(1.0);
+        spot_shape = sqrt(float3(1.0) - color_offset*color_offset);
+    }
+    return float3(beam_min_sigma) + sigma_range * spot_shape;
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  holding the desired min/max generalized Gaussian
+    //                  beta parameters, for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; callers
+    //                  pass it in so it is not recomputed on every call when
+    //                  beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian beta ("shape") for color.
+    //  Details/Discussion:
+    //  Beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    const float3 shape_curve = pow(color, float3(beam_shape_power));
+    return beam_min_shape + shape_range * shape_curve;
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    A scanline's light output averaged over a given pixel.
+    //  Details:
+    //  The CRT beam profile is roughly Gaussian, and wider for bright colors
+    //  than for dark ones.  A Gaussian always integrates to 1.0 over its full
+    //  range, so the standard deviation can vary with color without changing
+    //  overall brightness.  With 'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 * (1.0 + erf(x/(sigma * sqrt(2))))
+    //  Use a numerical approximation of the "error function" (the Gaussian
+    //  indefinite integral) to find the definite integral of the scanline's
+    //  average brightness over a given pixel area.  Even if curved coords were
+    //  used in this pass, a flat scalar pixel height works almost as well as a
+    //  pixel height computed from a full pixel-space to scanline-space matrix.
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 inv_sqrt2_sigma = 1.0/(beam_sigma*sqrt(2.0));
+    const float3 erf_upper = erf((dist + half_pixel)*inv_sqrt2_sigma);
+    const float3 erf_lower = erf((dist - half_pixel)*inv_sqrt2_sigma);
+    return color * 0.5*(erf_upper - erf_lower)/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    A scanline's light output averaged over a given pixel.
+    //  A generalized Gaussian distribution lets the shape (beta) vary as well
+    //  as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models the standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 half_pixel = float3(pixel_height * 0.5);
+    const float3 beam_alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beam_beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 inv_alpha = float3(1.0)/beam_alpha;
+    const float3 inv_beta = float3(1.0)/beam_beta;
+    //  gamma_impl takes beta alongside s = 1/beta to avoid repeated divides;
+    //  normalized_ligamma_impl likewise takes beta and 1/gamma(s).
+    const float3 inv_gamma_s = float3(1.0)/gamma_impl(inv_beta, beam_beta);
+    const float3 upper_edge = dist + half_pixel;
+    const float3 lower_edge = dist - half_pixel;
+    const float3 cdf_upper = sign(upper_edge) * normalized_ligamma_impl(inv_beta,
+        pow(abs(upper_edge)*inv_alpha, beam_beta), beam_beta, inv_gamma_s);
+    const float3 cdf_lower = sign(lower_edge) * normalized_ligamma_impl(inv_beta,
+        pow(abs(lower_edge)*inv_alpha, beam_beta), beam_beta, inv_gamma_s);
+    return color * 0.5*(cdf_upper - cdf_lower)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  See scanline_gaussian_integral_contrib() for detailed comments!
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
+    //  Precompute reciprocals so the samples below only multiply:
+    const float3 inv_sigma = float3(1.0)/beam_sigma;
+    const float3 exp_scale = 0.5 * inv_sigma * inv_sigma;
+    const float3 peak_scale = inv_sigma/sqrt(2.0*pi);
+    //  beam_antialias_level > 0.5 averages three samples per pixel:
+    if(beam_antialias_level > 0.5)
+    {
+        //  Take two extra samples, 1/3 pixel to either side of dist.
+        //  (The abs() is harmless here: the distances are squared below.)
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_pixel;
+        const float3 dist_minus = abs(dist - third_pixel);
+        //  Average the three pure Gaussian samples:
+        const float3 third_color = color/3.0 * peak_scale;
+        const float3 sample_mid = exp(-(dist*dist)*exp_scale);
+        const float3 sample_plus = exp(-(dist_plus*dist_plus)*exp_scale);
+        const float3 sample_minus = exp(-(dist_minus*dist_minus)*exp_scale);
+        return third_color * (sample_mid + sample_plus + sample_minus);
+    }
+    //  Otherwise evaluate a single sample at dist itself:
+    return color*exp(-(dist*dist)*exp_scale)*peak_scale;
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  See scanline_generalized_gaussian_integral_contrib() for details!
+    //  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 beam_alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beam_beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Reciprocals up front, so later steps multiply instead of divide:
+    const float3 inv_alpha = float3(1.0)/beam_alpha;
+    const float3 inv_beta = float3(1.0)/beam_beta;
+    //  amplitude = color * beta/(2*alpha*gamma(1/beta)), shared by all samples:
+    const float3 amplitude = color * beam_beta * 0.5 * inv_alpha /
+        gamma_impl(inv_beta, beam_beta);
+    //  beam_antialias_level > 0.5 averages three samples per pixel:
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel closer to and farther from the scanline too.
+        const float3 third_pixel = float3(pixel_height/3.0);
+        const float3 dist_plus = dist + third_pixel;
+        const float3 dist_minus = abs(dist - third_pixel);
+        //  Average the three generalized Gaussian samples:
+        const float3 sample_mid = exp(-pow(abs(dist*inv_alpha), beam_beta));
+        const float3 sample_plus = exp(-pow(abs(dist_plus*inv_alpha), beam_beta));
+        const float3 sample_minus = exp(-pow(abs(dist_minus*inv_alpha), beam_beta));
+        return amplitude/3.0 * (sample_mid + sample_plus + sample_minus);
+    }
+    //  Otherwise evaluate a single sample at dist itself:
+    return amplitude * exp(-pow(abs(dist*inv_alpha), beam_beta));
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    A scanline's light output over a given pixel, dispatching
+    //              on beam_generalized_gaussian (generalized vs. pure Gaussian)
+    //              and beam_antialias_level (integral vs. sampled evaluation).
+    //  Levels above 1.5 use the integral variants; anything lower goes to the
+    //  sampled variants, which test beam_antialias_level > 0.5 themselves.
+    const bool use_integral = beam_antialias_level > 1.5;
+    if(beam_generalized_gaussian && use_integral)
+    {
+        //  Generalized Gaussian, integrated over the pixel:
+        return scanline_generalized_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+    else if(beam_generalized_gaussian)
+    {
+        //  Generalized Gaussian, point-sampled:
+        return scanline_generalized_gaussian_sampled_contrib(
+            dist, color, pixel_height, sigma_range, shape_range);
+    }
+    else if(use_integral)
+    {
+        //  Pure Gaussian, integrated over the pixel:
+        return scanline_gaussian_integral_contrib(
+            dist, color, pixel_height, sigma_range);
+    }
+    else
+    {
+        return scanline_gaussian_sampled_contrib(
+            dist, color, pixel_height, sigma_range);
+    }
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{   //  Blend the colors via mul(), then clamp negatives to 0.0 (avoids artifacts):
+    const float3 blended = mul(weights, float4x3(color0, color1, color2, color3));
+    return max(blended, 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    An interpolated color lookup between the four input colors
+    //              based on the weights in weights.  The final color is always
+    //              a linear RGB value, but the blending itself is done in the
+    //              linear/gamma mix indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  Branching is cheap here (static weight or dynamic branches allowed):
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are colors in gamma-encoded RGB.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+            //  FIXME (left by the port): a disabled debug override that forced
+            //  the weight to 1.0 used to sit here; the reason is undocumented.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (color1)
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  color0, color1, color2, and color3, where color1 is just
+    //                  left of the output pixel.
+    //  Returns:    A horizontally interpolated texture lookup using 2-4 nearby
+    //              texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  The two outer texels stay black unless beam_horiz_filter > 0.5 (Quilez).
+    float3 outer_left = float3(0.0);
+    float3 outer_right = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        outer_left = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        outer_right = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    const float3 inner_left = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 inner_right = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    //  Samples stay as-is; get_interpolated_linear_color() handles the encoding.
+    return get_interpolated_linear_color(
+        outer_left, inner_left, inner_right, outer_right, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements (tex_uv * tex_size is used as texels).
+    //  Snap left to a texel center and measure distances to 2/4 nearby texels:
+    const float2 texel_pos = tex_uv * tex_size;
+    //  under_half works around a rounding bug right at exact texel locations.
+    const float2 left_center =
+        floor(texel_pos - float2(under_half)) + float2(0.5);
+    const float2 snapped_texel = float2(left_center.x, texel_pos.y);
+    const float2 snapped_uv = snapped_texel * texture_size_inv;
+    const float frac_x = texel_pos.x - snapped_texel.x;
+    const float4 tap_dists = float4(1.0 + frac_x, frac_x,
+        1.0 - frac_x, 2.0 - frac_x);
+    //  Pick Quilez, Gaussian, or Lanczos2 resize weights via beam_horiz_filter:
+    float4 raw_weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez (only the two inner texels get a weight):
+        const float t = tap_dists.y;
+        const float smooth_t = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
+        raw_weights = float4(0.0, 1.0 - smooth_t, smooth_t, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian with standard deviation beam_horiz_sigma:
+        float inv_two_variance = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        raw_weights = exp(-(tap_dists*tap_dists)*inv_two_variance);
+    }
+    else
+    {
+        //  Lanczos2 (FIX_ZERO is applied before dividing by pi_x squared):
+        const float4 pi_x = FIX_ZERO(tap_dists * pi);
+        raw_weights = 2.0 * sin(pi_x) * sin(pi_x * 0.5) /
+            (pi_x * pi_x);
+    }
+    //  Normalize so the four weights sum to 1.0:
+    const float4 unit_weights = raw_weights/dot(raw_weights, float4(1.0));
+    //  Blend the texels along the scanline with those weights:
+    const float2 texel_step = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        tex, snapped_uv, texel_step, unit_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.
+    //  Rely on a helper to make convergence easier.
+    if(!beam_misconvergence)
+    {
+        //  No misconvergence: one lookup serves all three channels.
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+    //  Subtract each channel's convergence offset from u.  The offsets are
+    //  scaled by texture_size_inv.x first, so they are presumably given in
+    //  texels (NOTE(review): confirm against get_convergence_offsets_x_vector).
+    const float3 texel_shift_rgb = get_convergence_offsets_x_vector();
+    const float3 uv_shift_rgb = texel_shift_rgb * texture_size_inv.xxx;
+    const float2 uv_red = tex_uv - float2(uv_shift_rgb.r, 0.0);
+    const float2 uv_green = tex_uv - float2(uv_shift_rgb.g, 0.0);
+    const float2 uv_blue = tex_uv - float2(uv_shift_rgb.b, 0.0);
+    const float3 lookup_red = sample_single_scanline_horizontal(
+        tex, uv_red, tex_size, texture_size_inv);
+    const float3 lookup_green = sample_single_scanline_horizontal(
+        tex, uv_green, tex_size, texture_size_inv);
+    const float3 lookup_blue = sample_single_scanline_horizontal(
+        tex, uv_blue, tex_size, texture_size_inv);
+    //  Keep red from the red lookup, green from green, and blue from blue:
+    return float3(lookup_red.r, lookup_green.g, lookup_blue.b);
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only consider even/odd scanlines every
+    //  other frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    //  FIXME (unresolved; presumably a porting leftover): disabled override of
+//	const float interlace_bff1 = 1.0;
+    const float field_offset = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 curr_texel = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel_num = floor(curr_texel - float2(under_half));
+    const float wrong_field = fmod(
+        prev_texel_num.y + field_offset, il_step_multiple.y);
+    const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 scanline_texel = scanline_texel_num + float2(0.5);
+    const float2 scanline_uv = scanline_texel * texture_size_inv;
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
+    return scanline_uv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Guess from the source's line count whether it is interlaced.
+    //  Background on line counts:
+    //      NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+    //      NTSC emulators: typically 224 or 240 lines
+    //      PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+    //      PAL emulators: ?
+    //      ATSC: 720p, 1080i, 1080p
+    //  Cutoff policy:
+    //      - Only active lines are considered.
+    //      - Above 288 and up to 576 lines: probably interlaced.
+    //      - Above 576 lines: probably progressive, except 1080 lines,
+    //        which is ambiguous and left to the interlace_1080i setting.
+    //      - Thresholds sit on half-lines in case the main program passes
+    //        calculated (inexact) video sizes.
+    if(interlace_detect)
+    {
+        //  Standard-definition window, strictly between the half-lines:
+        const bool sd_hit = (288.5 < num_lines) && (576.5 > num_lines);
+        //  The 1080-line window only counts when the user opted in:
+        const bool hd_hit = bool(interlace_1080i) &&
+            ((1079.5 < num_lines) && (1080.5 > num_lines));
+        return (hd_hit || sd_hit);
+    }
+    //  Detection is switched off: never report interlacing.
+    return false;
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;  //  Default: keep each sample's own alpha
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO  //  Only the first pass decodes and only the last pass encodes
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif  //  LAST_PASS
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode a color for this pass's output; passes that do not encode
+    if(!gamma_encode_output)
+    {
+        return color;   //  ...hand the color back untouched.
+    }
+    //  Raise rgb to 1/gamma, using the output gamma selected for this pass:
+    float3 encoded_rgb = pow(color.rgb,
+        float3(1.0/get_pass_output_gamma()));
+    //  Alpha is never gamma-encoded: copy it, or force it opaque on request.
+    float out_alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        out_alpha = 1.0;
+    }
+    return float4(encoded_rgb, out_alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize a sample read from this pass's input texture.
+    if(!linearize_input)
+    {
+        return color;   //  This pass does not decode: pass the sample through.
+    }
+    //  Raise rgb to the input gamma selected for this pass:
+    float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
+    //  Alpha is never gamma-decoded.  It is copied through unless
+    //  assume_opaque_alpha asks for a forced 1.0:
+    float out_alpha = color.a;
+    if(assume_opaque_alpha)
+    {
+        out_alpha = 1.0;
+    }
+    return float4(linear_rgb, out_alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Linearize color.rgb with the caller-supplied per-channel gamma.
+    //  Unlike decode_input(), this does not consult linearize_input: it
+    //  decodes unconditionally.
+    float3 linear_rgb = pow(color.rgb, gamma);
+    //  Alpha is not decoded.  Force it to 1.0 when assume_opaque_alpha is
+    //  set; otherwise keep the sample's own alpha:
+    float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, out_alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D (sample, then decode_input(); float3 overloads only use tex_coords.xy):
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }   //  NOTE(review): texel_off is passed as textureLod's LOD argument, not as a texel offset -- confirm intent
+
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }   //  NOTE(review): same LOD-vs-offset caveat as the float2 overload
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod (only tex_coords.xy is read; tex_coords.zw is ignored by both overloads):
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }   //  Always samples at LOD 0.0
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }   //  NOTE(review): texel_off is used as the LOD here -- confirm intent
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D (NOTE(review): currently dead code -- the block comment opened before the tex3D wrappers has no terminator until after the tex2Dfetch_linearize_gamma wrappers below, so these two overloads are never compiled):
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: fetch tex at tex_coords.xy with an explicit LOD of 0.0 (tex_coords.zw are unused here), then pass the texel through decode_gamma_input.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   float4 encoded_texel = textureLod(tex, tex_coords.xy, 0.0);  return decode_gamma_input(encoded_texel, gamma);    }
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)  //  NOTE(review): texel_off is forwarded as textureLod's third (LOD) argument below, not as a texel offset -- confirm this is intended.
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+void main() {  //  Entry point: sums the per-scanline beam contributions for this pixel and writes the result to FragColor.
+	vec2 tex_uv = vTexCoord.xy;
+    //  This pass: Sample multiple (misconverged?) scanlines to the final
+    //  vertical resolution.  Temporarily auto-dim the output to avoid clipping.
+
+    //  Read some attributes into local variables:
+    float2 texture_size_ = texture_size;
+    float2 texture_size_inv = 1.0/texture_size_;
+    //const float2 uv_step = uv_step;
+    //const float2 il_step_multiple = il_step_multiple;
+    float frame_count = float(frame_count);
+    const float ph = pixel_height_in_scanlines;
+
+    //  Get the uv coords of the previous scanline (in this field), and the
+    //  scanline's distance from this sample, in scanlines.
+    float dist;
+    const float2 scanline_uv = get_last_scanline_uv(tex_uv, texture_size_,
+        texture_size_inv, il_step_multiple, frame_count, dist);
+    //  Consider 2, 3, 4, or 6 scanlines numbered 0-5: The previous and next
+    //  scanlines are numbered 2 and 3.  Get scanline colors (ignore
+    //  horizontal sampling, since output_size.x = video_size.x).
+    //  NOTE: Anisotropic filtering creates interlacing artifacts, which is why
+    //  ORIG_LINEARIZED bobbed any interlaced input before this pass.
+    const float2 v_step = float2(0.0, uv_step.y);
+    const float3 scanline2_color = tex2D_linearize(input_texture, scanline_uv).rgb;
+    const float3 scanline3_color =
+        tex2D_linearize(input_texture, scanline_uv + v_step).rgb;
+    float3 scanline0_color, scanline1_color, scanline4_color, scanline5_color,
+        scanline_outside_color;  //  Only the colors needed by the branch selected below get assigned.
+    float dist_round;  //  Assigned and read only in the 5-scanline and 3-scanline branches.
+    //  Use scanlines 0, 1, 4, and 5 for a total of 6 scanlines:
+    if(beam_num_scanlines > 5.5)
+    {
+        scanline1_color =
+            tex2D_linearize(input_texture, scanline_uv - v_step).rgb;
+        scanline4_color =
+            tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb;
+        scanline0_color =
+            tex2D_linearize(input_texture, scanline_uv - 2.0 * v_step).rgb;
+        scanline5_color =
+            tex2D_linearize(input_texture, scanline_uv + 3.0 * v_step).rgb;
+    }
+    //  Use scanlines 1, 4, and either 0 or 5 for a total of 5 scanlines:
+    else if(beam_num_scanlines > 4.5)
+    {
+        scanline1_color =
+            tex2D_linearize(input_texture, scanline_uv - v_step).rgb;
+        scanline4_color =
+            tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb;
+        //  dist is in [0, 1]
+        dist_round = round(dist);
+        const float2 sample_0_or_5_uv_off =
+            lerp(-2.0 * v_step, 3.0 * v_step, dist_round);
+        //  Call this "scanline_outside_color" to cope with the conditional
+        //  scanline number:
+        scanline_outside_color = tex2D_linearize(
+            input_texture, scanline_uv + sample_0_or_5_uv_off).rgb;
+    }
+    //  Use scanlines 1 and 4 for a total of 4 scanlines:
+    else if(beam_num_scanlines > 3.5)
+    {
+        scanline1_color =
+            tex2D_linearize(input_texture, scanline_uv - v_step).rgb;
+        scanline4_color =
+            tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb;
+    }
+    //  Use scanline 1 or 4 for a total of 3 scanlines:
+    else if(beam_num_scanlines > 2.5)
+    {
+        //  dist is in [0, 1]
+        dist_round = round(dist);
+        const float2 sample_1or4_uv_off =
+            lerp(-v_step, 2.0 * v_step, dist_round);
+        scanline_outside_color = tex2D_linearize(
+            input_texture, scanline_uv + sample_1or4_uv_off).rgb;
+    }
+    
+    //  Compute scanline contributions, accounting for vertical convergence.
+    //  Vertical convergence offsets are in units of current-field scanlines.
+    //  dist2 means "positive sample distance from scanline 2, in scanlines:"
+    float3 dist2 = float3(dist);
+    if(beam_misconvergence)
+    {
+        const float3 convergence_offsets_vert_rgb =
+            get_convergence_offsets_y_vector();
+        dist2 = float3(dist) - convergence_offsets_vert_rgb;
+    }
+    //  Calculate {sigma, shape}_range outside of scanline_contrib so it's only
+    //  done once per pixel (not 6 times) with runtime params.  Don't reuse the
+    //  vertex shader calculations, so static versions can be constant-folded.
+	const float sigma_range = max(beam_max_sigma, beam_min_sigma) -
+        beam_min_sigma;
+	const float shape_range = max(beam_max_shape, beam_min_shape) -
+        beam_min_shape;
+    //  Calculate and sum final scanline contributions, starting with lines 2/3.
+    //  There is no normalization step, because we're not interpolating a
+    //  continuous signal.  Instead, each scanline is an additive light source.
+    const float3 scanline2_contrib = scanline_contrib(dist2,
+        scanline2_color, ph, sigma_range, shape_range);
+    const float3 scanline3_contrib = scanline_contrib(abs(float3(1.0,1.0,1.0) - dist2),
+        scanline3_color, ph, sigma_range, shape_range);
+    float3 scanline_intensity = scanline2_contrib + scanline3_contrib;
+    if(beam_num_scanlines > 5.5)
+    {
+        const float3 scanline0_contrib =
+            scanline_contrib(dist2 + float3(2.0,2.0,2.0), scanline0_color,
+                ph, sigma_range, shape_range);
+        const float3 scanline1_contrib =
+            scanline_contrib(dist2 + float3(1.0,1.0,1.0), scanline1_color,
+                ph, sigma_range, shape_range);
+        const float3 scanline4_contrib =
+            scanline_contrib(abs(float3(2.0,2.0,2.0) - dist2), scanline4_color,
+                ph, sigma_range, shape_range);
+        const float3 scanline5_contrib =
+            scanline_contrib(abs(float3(3.0) - dist2), scanline5_color,
+                ph, sigma_range, shape_range);
+        scanline_intensity += scanline0_contrib + scanline1_contrib +
+            scanline4_contrib + scanline5_contrib;
+    }
+    else if(beam_num_scanlines > 4.5)
+    {
+        const float3 scanline1_contrib =
+            scanline_contrib(dist2 + float3(1.0,1.0,1.0), scanline1_color,
+                ph, sigma_range, shape_range);
+        const float3 scanline4_contrib =
+            scanline_contrib(abs(float3(2.0,2.0,2.0) - dist2), scanline4_color,
+                ph, sigma_range, shape_range);
+        const float3 dist0or5 = lerp(
+            dist2 + float3(2.0,2.0,2.0), float3(3.0,3.0,3.0) - dist2, dist_round);
+        const float3 scanline0or5_contrib = scanline_contrib(
+            dist0or5, scanline_outside_color, ph, sigma_range, shape_range);
+        scanline_intensity += scanline1_contrib + scanline4_contrib +
+            scanline0or5_contrib;
+    }
+    else if(beam_num_scanlines > 3.5)
+    {
+        const float3 scanline1_contrib =
+            scanline_contrib(dist2 + float3(1.0,1.0,1.0), scanline1_color,
+                ph, sigma_range, shape_range);
+        const float3 scanline4_contrib =
+            scanline_contrib(abs(float3(2.0,2.0,2.0) - dist2), scanline4_color,
+                ph, sigma_range, shape_range);
+        scanline_intensity += scanline1_contrib + scanline4_contrib;
+    }
+    else if(beam_num_scanlines > 2.5)
+    {
+        const float3 dist1or4 = lerp(
+            dist2 + float3(1.0,1.0,1.0), float3(2.0,2.0,2.0) - dist2, dist_round);
+        const float3 scanline1or4_contrib = scanline_contrib(
+            dist1or4, scanline_outside_color, ph, sigma_range, shape_range);
+        scanline_intensity += scanline1or4_contrib;
+    }
+
+    //  Auto-dim the image to avoid clipping, encode if necessary, and output.
+    //  My original idea was to compute a minimal auto-dim factor and put it in
+    //  the alpha channel, but it wasn't working, at least not reliably.  This
+    //  is faster anyway, levels_autodim_temp = 0.5 isn't causing banding.
+    FragColor = encode_output(float4(scanline_intensity * levels_autodim_temp, 1.0));
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.vs b/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.vs
new file mode 100644
index 00000000..8fe7b14c
--- /dev/null
+++ b/shaders/CRT-Royale.shader/scanlines-vertical-interlacing.vs
@@ -0,0 +1,5830 @@
+#version 150
+
+in vec4 position;  // per-vertex inputs
+in vec2 texCoord;
+
+out Vertex {  // values this stage hands on to the next one
+   vec2 vTexCoord;
+   vec2 uv_step;
+   vec2 il_step_multiple;
+   float pixel_height_in_scanlines;
+};
+
+uniform vec4 targetSize;  // .xy is aliased as output_size by the macros below
+uniform vec4 sourceSize[];  // element 0's .xy is aliased as both texture_size and video_size by the macros below
+
+// USER SETTINGS BLOCK: object-like macros that replace each bare parameter name with a fixed literal. NOTE(review): some differ from the *_static defaults further down (e.g. geom_mode_runtime 2.0 vs geom_mode_static 0.0) -- confirm which is intended. //
+
+#define crt_gamma 2.500000
+#define lcd_gamma 2.200000
+#define levels_contrast 1.0
+#define halation_weight 0.0
+#define diffusion_weight 0.075
+#define bloom_underestimate_levels 0.8
+#define bloom_excess 0.000000
+#define beam_min_sigma 0.020000
+#define beam_max_sigma 0.300000
+#define beam_spot_power 0.330000
+#define beam_min_shape 2.000000
+#define beam_max_shape 4.000000
+#define beam_shape_power 0.250000
+#define beam_horiz_filter 0.000000
+#define beam_horiz_sigma 0.35
+#define beam_horiz_linear_rgb_weight 1.000000
+#define convergence_offset_x_r -0.000000
+#define convergence_offset_x_g 0.000000
+#define convergence_offset_x_b 0.000000
+#define convergence_offset_y_r 0.000000
+#define convergence_offset_y_g -0.000000
+#define convergence_offset_y_b 0.000000
+#define mask_type 1.000000
+#define mask_sample_mode_desired 0.000000
+#define mask_specify_num_triads 0.000000
+#define mask_triad_size_desired 3.000000
+#define mask_num_triads_desired 480.000000
+#define aa_subpixel_r_offset_x_runtime -0.0
+#define aa_subpixel_r_offset_y_runtime 0.000000
+#define aa_cubic_c 0.500000
+#define aa_gauss_sigma 0.500000
+#define geom_mode_runtime 2.000000
+#define geom_radius 2.000000
+#define geom_view_dist 2.000000
+#define geom_tilt_angle_x 0.000000
+#define geom_tilt_angle_y 0.000000
+#define geom_aspect_ratio_x 432.000000
+#define geom_aspect_ratio_y 329.000000
+#define geom_overscan_x 1.000000
+#define geom_overscan_y 1.000000
+#define border_size 0.015
+#define border_darkness 2.0
+#define border_compress 2.500000
+#define interlace_bff 0.000000
+#define interlace_1080i 0.000000
+
+// END USER SETTINGS BLOCK //
+
+// compatibility macros for transparently converting HLSLisms into GLSLisms (Cg/HLSL spellings are rewritten textually to GLSL ones)
+#define mul(a,b) ((b)*(a))  // operands are multiplied in swapped order; each argument is parenthesized so compound expressions keep their grouping
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size sourceSize[0].xy
+#define video_size sourceSize[0].xy
+#define output_size targetSize.xy
+#define frame_count phase
+#define static  
+#define inline  
+#define const  
+#define fmod(x,y) mod(x,y)
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(x,y) atan(y,x)  // NOTE(review): forwards its two arguments in swapped order -- confirm call sites expect this
+#define rsqrt(c) inversesqrt(c)
+
+#define input_texture source[0]
+
+#if defined(GL_ES)
+	#define COMPAT_PRECISION mediump
+#else
+	#define COMPAT_PRECISION
+#endif
+
+#if __VERSION__ >= 130
+	#define COMPAT_TEXTURE texture
+#else
+	#define COMPAT_TEXTURE texture2D
+#endif
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): an earlier comment here said the value had been raised to 1.0, but the code sets 0.5.
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): an earlier note said this was raised to 1.0 because 0.5 looked too dark, but the value here is 0.5 -- confirm
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  1: 3x3 resize blur (used when RADEON_FIX is #defined)
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0 (modes 0 and 1 pass through):
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else   //  !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static setting unchanged
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  !RUNTIME_SHADER_PARAMS_ENABLE
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else   //  !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif  //  ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+#else   //  !DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif  //  DRIVERS_ALLOW_TEX2DLOD
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else   //  !DRIVERS_ALLOW_DYNAMIC_BRANCHES: only enable if the user forces it
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif  //  DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else   //  !GEOMETRY_EARLY
+    static const float max_aa_base_pixel_border = 0.0;
+#endif  //  GEOMETRY_EARLY
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else   //  !ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE: one tile plus a border on each side
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+#else   //  GEOMETRY_EARLY: same values as the bordered case above
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif  //  !GEOMETRY_EARLY
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "bind-shader-params.h"
+
+/////////////////////////////  BEGIN BIND-SHADER-PARAMS  ////////////////////////////
+
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default (raise toward 1.0 if the result looks too dark)
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bounding
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+/////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero: yields max(|c|, 2^-16), so the sign of c is dropped.
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16; a macro overloads for float, float2, etc.
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifndef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#else   //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    //  Filter 2 (4x4 Gaussian resize) won't work here, and filter 1 (3x3 blur)
+    //  is inferior in most cases, so any setting above 1.5 becomes 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD  //  tex2Dlod wins: undefine every other strategy
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else   //  the tex2Dlod strategy is not active
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS  //  tex2Dbias wins: undefine the two tile-based strategies
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else   //  neither tex2Dlod nor tex2Dbias is active
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE    //  tile-flat-twice wins: undefine fix-discontinuities
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#else
+    #ifndef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #else
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #endif
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#else   //  dynamic branches are available, so always allow runtime selection
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifndef GEOMETRY_EARLY
+    static const float max_aa_base_pixel_border = 0.0;
+#else
+    //  Base antialiasing radius (4.0 pixels for most filters) plus the x
+    //  component of aa_subpixel_r_offset_static:
+    static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+    static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#else   //  no tex2Dlod/tex2Dbias tiling strategy in use
+    static const float max_aniso_pixel_border = 0.5 + max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#else   //  one extra border pixel for the discontinuity fix
+    static const float max_tiled_pixel_border = 1.0 + max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifndef GEOMETRY_EARLY
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#else   //  same-pass curvature: assume about triple the texel frequency
+    static const float max_mask_texel_border =
+        ceil(3.0 * max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifdef GEOMETRY_EARLY
+    static const float mask_start_texels = max_mask_texel_border;
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+#else
+    #ifndef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        static const float mask_start_texels = max_mask_texel_border;
+        static const float mask_resize_num_tiles =
+            1.0 + 2.0 * max_mask_tile_border;
+    #else
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_start_texels = 0.0;
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+    #endif
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+////////////////////   END DERIVED-SETTINGS-AND-CONSTANTS   /////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Returns a unit-length (aspect, 1.0) vector.  The ratio is capped at
+    //  geom_max_aspect_ratio, and normalizing keeps the absolute scale from
+    //  affecting the uv-mapping for curvature.
+    const float capped_ratio =
+        min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const float2 raw_aspect = float2(capped_ratio, 1.0);
+    return normalize(raw_aspect);
+}
+
+inline float2 get_geom_overscan_vector()
+{   //  Packs the scalar geom_overscan_x/geom_overscan_y pair into one float2.
+    return float2(geom_overscan_x, geom_overscan_y);
+}
+
+inline float2 get_geom_tilt_angle_vector()
+{   //  Packs the scalar geom_tilt_angle_x/geom_tilt_angle_y pair into one float2.
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+inline float3 get_convergence_offsets_x_vector()
+{   //  Packs the three convergence_offset_x_* scalars in (r, g, b) order.
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
+        convergence_offset_x_b);
+}
+
+inline float3 get_convergence_offsets_y_vector()
+{   //  Packs the three convergence_offset_y_* scalars in (r, g, b) order.
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
+        convergence_offset_y_b);
+}
+
+inline float2 get_convergence_offsets_r_vector()
+{   //  Packs the r channel's convergence offsets as (x, y).
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+inline float2 get_convergence_offsets_g_vector()
+{   //  Packs the g channel's convergence offsets as (x, y).
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+inline float2 get_convergence_offsets_b_vector()
+{   //  Packs the b channel's convergence offsets as (x, y).
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+inline float2 get_aa_subpixel_r_offset()
+{
+    //  Runtime offsets apply only if runtime AA weights AND offsets are on:
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        return aa_subpixel_r_offset_static;
+    #else
+        #ifndef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+            return aa_subpixel_r_offset_static;
+        #else   //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+            return float2(aa_subpixel_r_offset_x_runtime,
+                aa_subpixel_r_offset_y_runtime);
+        #endif
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+inline float get_mask_amplify()
+{
+    //  Each gain is the reciprocal of that mask's average color constant:
+    static const float grille_gain = 1.0/mask_grille_avg_color;
+    static const float slot_gain = 1.0/mask_slot_avg_color;
+    static const float shadow_gain = 1.0/mask_shadow_avg_color;
+    return mask_type < 0.5 ? grille_gain :
+        (mask_type < 1.5 ? slot_gain : shadow_gain);
+}
+
+inline float get_mask_sample_mode()
+{
+    //  Start from mask_sample_mode_desired when runtime mask mode/type
+    //  selection is compiled in, else from mask_sample_mode_static:
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        float requested_mode = mask_sample_mode_desired;
+    #else
+        float requested_mode = mask_sample_mode_static;
+    #endif
+    //  Without manual mask resizing, restrict the result to [1.0, 2.0]:
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+////////////////////////////  END BIND-SHADER-PARAMS  ///////////////////////////
+
+//#include "scanline-functions.h"
+
+/////////////////////////////  BEGIN SCANLINE-FUNCTIONS  ////////////////////////////
+
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; default 0.5.  NOTE(review): an earlier note here said the value was raised to 1.0 because 0.5 looked too dark, but the constant is 0.5 -- confirm the intended value
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bounding
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  1: 3x3 resize blur (used when RADEON_FIX is defined)
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+////////////////////////////  END USER-SETTINGS  //////////////////////////
+
+//#include "derived-settings-and-constants.h"
+
+////////////////////  BEGIN DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////
+
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+///////////////////////////////  BEGIN INCLUDES  ///////////////////////////////
+
+//#include "../user-settings.h"
+
+/////////////////////////////  BEGIN USER-SETTINGS  ////////////////////////////
+
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]; 0.5 is the default and the value in effect here.  If that looks unnecessarily dark, try a higher value (up to 1.0).
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bounding
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;    //  2: true 4x4 Gaussian resize
+#else
+    static const float bloom_approx_filter_static = 1.0;    //  1: 3x3 resize blur (used when RADEON_FIX is defined)
+#endif
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
+/////////////////////////////   END USER-SETTINGS   ////////////////////////////
+
+//#include "user-cgp-constants.h"
+
+/////////////////////////   BEGIN USER-CGP-CONSTANTS   /////////////////////////
+
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+static const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+static const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+static const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+static const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+static const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
+#else
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
+//////////////////////////   END USER-CGP-CONSTANTS   //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    static const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+    static const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+        #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #endif
+    #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+        #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #endif
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_TILT
+        #undef RUNTIME_GEOMETRY_TILT
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  This concentrates all the nesting in one place.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #endif
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#else
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #endif
+    #else
+        //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+        //  flat texture coords in the same pass, but that's all we use.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+                #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            #endif
+        #endif
+    #endif
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+#ifdef GEOMETRY_EARLY
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+        static const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;    //  NOTE(review): signed; x < 0 shrinks this
+#else
+    static const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+    static const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+static const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
+    #else
+        static const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+        static const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+static const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+static const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  END DERIVED-SETTINGS-AND-CONSTANTS  ////////////////////////////
+
+//#include "../../../../include/special-functions.h"
+
+///////////////////////////  BEGIN SPECIAL-FUNCTIONS  //////////////////////////
+
+#ifndef SPECIAL_FUNCTIONS_H
+#define SPECIAL_FUNCTIONS_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file implements the following mathematical special functions:
+//  1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))
+//  2.) gamma(s), a real-numbered extension of the integer factorial function
+//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
+//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
+//  be called with an _impl suffix to use an implementation version with a few
+//  extra precomputed parameters (which may be useful for the caller to reuse).
+//  See below for details.
+//
+//  Design Rationale:
+//  Pretty much every line of code in this file is duplicated four times for
+//  different input types (float4/float3/float2/float).  This is unfortunate,
+//  but Cg doesn't allow function templates.  Macros would be far less verbose,
+//  but they would make the code harder to document and read.  I don't expect
+//  these functions will require a whole lot of maintenance changes unless
+//  someone ever has need for more robust incomplete gamma functions, so code
+//  duplication seems to be the lesser evil in this case.
+
+
+///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
+
+float4 erf6(float4 x)
+{
+    //  Requires:   x is the usual argument of erf().
+    //  Returns:    An Abramowitz/Stegun estimate of the error function:
+    //                  erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
+    //              Its max absolute error is 2.5*10**-5, and it's both
+    //              numerically robust and efficient.  See:
+    //                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
+	static const float4 unit = float4(1.0);
+	const float4 denom = unit + 0.47047*abs(x);
+	const float4 t = unit/denom;
+	const float4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+	const float4 magnitude = unit - poly*exp(-(x*x));
+	return magnitude * sign(x);
+}
+
+float3 erf6(const float3 x)
+{
+    //  Float3 overload of the Abramowitz/Stegun erf() approximation:
+	static const float3 unit = float3(1.0);
+	const float3 denom = unit + 0.47047*abs(x);
+	const float3 t = unit/denom;
+	const float3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+	const float3 magnitude = unit - poly*exp(-(x*x));
+	return magnitude * sign(x);
+}
+
+float2 erf6(const float2 x)
+{
+    //  Float2 overload of the Abramowitz/Stegun erf() approximation:
+	static const float2 unit = float2(1.0);
+	const float2 denom = unit + 0.47047*abs(x);
+	const float2 t = unit/denom;
+	const float2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+	const float2 magnitude = unit - poly*exp(-(x*x));
+	return magnitude * sign(x);
+}
+
+float erf6(const float x)
+{
+    //  Scalar overload of the Abramowitz/Stegun erf() approximation:
+	const float denom = 1.0 + 0.47047*abs(x);
+	const float t = 1.0/denom;
+	const float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
+	const float magnitude = 1.0 - poly*exp(-(x*x));
+	return magnitude * sign(x);
+}
+
+float4 erft(const float4 x)
+{
+    //  Requires:   x is the usual argument of erf().
+    //  Returns:    A tanh-based estimate of erf().  Its error is visible to
+    //              the eye, but it's very cheap and perceptually close (at
+    //              least on ATI hardware).  See:
+    //                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
+    //  Warning:    Drivers must implement tanh() correctly for this to work:
+    //              The original author's nVidia 8800GTS returned garbage.
+	return tanh(x * 1.202760580);
+}
+
+float3 erft(const float3 x)
+{
+    //  Float3 overload of the tanh-based erf() estimate:
+	return tanh(x * 1.202760580);
+}
+
+float2 erft(const float2 x)
+{
+    //  Float2 overload of the tanh-based erf() estimate:
+	return tanh(x * 1.202760580);
+}
+
+float erft(const float x)
+{
+    //  Scalar overload of the tanh-based erf() estimate:
+	return tanh(x * 1.202760580);
+}
+
+inline float4 erf(const float4 x)
+{
+    //  Requires:   x is the usual argument of erf().
+    //  Returns:    erft(x) if ERF_FAST_APPROXIMATION is #defined, else erf6(x).
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+inline float3 erf(const float3 x)
+{
+    //  Float3 overload: erft() if ERF_FAST_APPROXIMATION is set, else erf6().
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+inline float2 erf(const float2 x)
+{
+    //  Float2 overload: erft() if ERF_FAST_APPROXIMATION is set, else erf6().
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+inline float erf(const float x)
+{
+    //  Scalar overload: erft() if ERF_FAST_APPROXIMATION is set, else erf6().
+	#ifndef ERF_FAST_APPROXIMATION
+		return erf6(x);
+	#else
+		return erft(x);
+	#endif
+}
+
+
+///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
+
+float4 gamma_impl(const float4 s, const float4 s_inv)
+{
+    //  Requires:   1.) s is the usual gamma function argument, expected to
+    //                  lie in the [0, 36] range.
+    //              2.) s_inv = 1.0/s.  The caller precomputes this value for
+    //                  this implementation function, which lets the caller
+    //                  reuse it elsewhere.
+    //  Returns:    An approximation of the gamma function (real-numbered
+    //              factorial) from a two-coefficient Lanczos approximation,
+    //              with coefficients found by Paul Godfrey's method:
+    //                  http://my.fit.edu/~gabdo/gamma.txt
+    //              An optimal g value for s in [0, 36] is ~1.12906830989, with
+    //              a maximum relative error of 0.000463 for 2**16 equally
+    //              spaced evals.  Three coeffs (0.0000346 error) wouldn't
+    //              hurt latency, but two allow more parallelism with
+    //              outside instructions.
+	static const float4 lanczos_g = float4(1.12906830989);
+	static const float4 coeff0 = float4(0.8109119309638332633713423362694399653724431);
+	static const float4 coeff1 = float4(0.4808354605142681877121661197951496120000040);
+	static const float4 euler = float4(2.71828182845904523536028747135266249775724709);
+	const float4 s_plus_half = s + float4(0.5);
+	const float4 series = coeff0 + coeff1/(s + float4(1.0));
+	const float4 scaled_base = (s_plus_half + lanczos_g)/euler;
+	//  scaled_base**s_plus_half * series approximates gamma(s + 1), so
+	//  multiply by s_inv for gamma(s); less error for small s than s -= 1.
+	return (pow(scaled_base, s_plus_half) * series) * s_inv;
+}
+
+float3 gamma_impl(const float3 s, const float3 s_inv)
+{
+	//  Float3 overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+	static const float3 lanczos_g = float3(1.12906830989);
+	static const float3 coeff0 = float3(0.8109119309638332633713423362694399653724431);
+	static const float3 coeff1 = float3(0.4808354605142681877121661197951496120000040);
+	static const float3 euler = float3(2.71828182845904523536028747135266249775724709);
+	const float3 s_plus_half = s + float3(0.5);
+	const float3 series = coeff0 + coeff1/(s + float3(1.0));
+	const float3 power_base = (s_plus_half + lanczos_g)/euler;
+	return (pow(power_base, s_plus_half) * series) * s_inv;
+}
+
+float2 gamma_impl(const float2 s, const float2 s_inv)
+{
+	//  Float2 overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+	static const float2 lanczos_g = float2(1.12906830989);
+	static const float2 coeff0 = float2(0.8109119309638332633713423362694399653724431);
+	static const float2 coeff1 = float2(0.4808354605142681877121661197951496120000040);
+	static const float2 euler = float2(2.71828182845904523536028747135266249775724709);
+	const float2 s_plus_half = s + float2(0.5);
+	const float2 series = coeff0 + coeff1/(s + float2(1.0));
+	const float2 power_base = (s_plus_half + lanczos_g)/euler;
+	return (pow(power_base, s_plus_half) * series) * s_inv;
+}
+
+float gamma_impl(const float s, const float s_inv)
+{
+	//  Float overload: two-coefficient Lanczos gamma(s); s_inv must be 1.0/s.
+	static const float lanczos_g = 1.12906830989;
+	static const float coeff0 = 0.8109119309638332633713423362694399653724431;
+	static const float coeff1 = 0.4808354605142681877121661197951496120000040;
+	static const float euler = 2.71828182845904523536028747135266249775724709;
+	const float s_plus_half = s + 0.5;
+	const float series = coeff0 + coeff1/(s + 1.0);
+	const float power_base = (s_plus_half + lanczos_g)/euler;
+	return (pow(power_base, s_plus_half) * series) * s_inv;
+}
+
+float4 gamma(const float4 s)
+{
+    //  Requires:   s is the gamma function argument, expected in [0, 36].
+    //  Returns:    Approximate gamma(s) with a maximum relative error of
+    //              0.000463; gamma_impl documents the method.
+	const float4 s_inv = float4(1.0)/s;
+	return gamma_impl(s, s_inv);
+}
+
+float3 gamma(const float3 s)   //  Float3 overload: approximate gamma(s) via gamma_impl.
+{
+	const float3 s_inv = float3(1.0)/s;
+	return gamma_impl(s, s_inv);
+}
+
+float2 gamma(const float2 s)   //  Float2 overload: approximate gamma(s) via gamma_impl.
+{
+	const float2 s_inv = float2(1.0)/s;
+	return gamma_impl(s, s_inv);
+}
+
+float gamma(const float s)     //  Float overload: approximate gamma(s) via gamma_impl.
+{
+	const float s_inv = 1.0/s;
+	return gamma_impl(s, s_inv);
+}
+
+
+////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
+
+//  Lower incomplete gamma function for small s and z (implementation):
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z <= ~0.775075
+    //              3.) s_inv = 1.0/s (precomputed so callers can reuse it)
+    //  Returns:    The first four terms of a power series for the lower
+    //              incomplete gamma function, meant for small s and small z:
+    //                  z**s * sum over k in [0, 3] of
+    //                      (-1)**k * z**k / ((s + k) * k!)
+	//  The terms are written out by hand rather than looped, with each
+	//  factorial folded into its denominator so the divisions are independent
+	//  of one another:
+	//      k = 0:  1/s             (passed in as s_inv)
+	//      k = 1:  -z/(s + 1)
+	//      k = 2:  z**2/(2*s + 4)
+	//      k = 3:  -z**3/(6*s + 18)
+	//  A fifth term would be z**4/(24*s + 96); it stays disabled:
+	//      term4 = z_sq * z_sq/(24.0*s + float4(96.0));
+	const float4 z_sq = z*z;
+	const float4 term1 = z/(s + float4(1.0));
+	const float4 term2 = z_sq/(2.0*s + float4(4.0));
+	const float4 term3 = (z * z_sq)/(6.0*s + float4(18.0));
+	//  Accumulate in series order, starting from the k = 0 term:
+	float4 series = s_inv;
+	series -= term1;
+	series += term2;
+	series -= term3;
+	//  Apply the z**s factor shared by every term:
+	const float4 z_pow_s = pow(z, s);
+	return z_pow_s * series;
+}
+
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
+{
+	//  Float3 overload: z**s * (1/s - z/(s+1) + z**2/(2s+4) - z**3/(6s+18)).
+	const float3 z_sq = z*z;
+	const float3 term1 = z/(s + float3(1.0));
+	const float3 term2 = z_sq/(2.0*s + float3(4.0));
+	const float3 term3 = (z * z_sq)/(6.0*s + float3(18.0));
+	float3 series = s_inv;  //  First term; s_inv must equal 1.0/s
+	series -= term1;
+	series += term2;
+	series -= term3;
+	const float3 z_pow_s = pow(z, s);
+	return z_pow_s * series;
+}
+
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
+{
+	//  Float2 overload: z**s * (1/s - z/(s+1) + z**2/(2s+4) - z**3/(6s+18)).
+	const float2 z_sq = z*z;
+	const float2 term1 = z/(s + float2(1.0));
+	const float2 term2 = z_sq/(2.0*s + float2(4.0));
+	const float2 term3 = (z * z_sq)/(6.0*s + float2(18.0));
+	float2 series = s_inv;  //  First term; s_inv must equal 1.0/s
+	series -= term1;
+	series += term2;
+	series -= term3;
+	const float2 z_pow_s = pow(z, s);
+	return z_pow_s * series;
+}
+
+float ligamma_small_z_impl(const float s, const float z, const float s_inv)
+{
+	//  Float overload: z**s * (1/s - z/(s+1) + z**2/(2s+4) - z**3/(6s+18)).
+	const float z_sq = z*z;
+	const float term1 = z/(s + 1.0);
+	const float term2 = z_sq/(2.0*s + 4.0);
+	const float term3 = (z * z_sq)/(6.0*s + 18.0);
+	float series = s_inv;   //  First term; s_inv must equal 1.0/s
+	series -= term1;
+	series += term2;
+	series -= term3;
+	const float z_pow_s = pow(z, s);
+	return z_pow_s * series;
+}
+
+//  Upper incomplete gamma function for small s and large z (implementation):
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) z > ~0.775075
+    //  Returns:    A four-level truncation of Gauss's continued fraction for
+    //              the upper incomplete gamma function:
+    //                  z**s * exp(-z) / level1
+	//  The fraction is evaluated from the innermost level outward.  Rolled up,
+	//  with the deepest denominator treated as infinite, it reads:
+	//      denom = infinity;
+	//      for(int i = 4; i > 0; --i)
+	//      {
+	//          denom = (2.0*i - 1.0) + z - s + (i * (s - i))/denom;
+	//      }
+	//  Unrolled below with the constants folded:
+	//      i*(s - i) becomes i*s - i*i, e.g. 3.0*s - 9.0 for i = 3.
+	const float4 level4 = float4(7.0) + z - s;
+	const float4 level3 = float4(5.0) + z - s + (3.0*s - float4(9.0))/level4;
+	const float4 level2 = float4(3.0) + z - s + (2.0*s - float4(4.0))/level3;
+	const float4 level1 = float4(1.0) + z - s + (s - float4(1.0))/level2;
+	return (pow(z, s) * exp(-z))/level1;
+}
+
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
+{
+	//  Float3 overload: four-level continued fraction, innermost level first.
+	const float3 level4 = float3(7.0) + z - s;
+	const float3 level3 = float3(5.0) + z - s + (3.0*s - float3(9.0))/level4;
+	const float3 level2 = float3(3.0) + z - s + (2.0*s - float3(4.0))/level3;
+	const float3 level1 = float3(1.0) + z - s + (s - float3(1.0))/level2;
+	const float3 scale = pow(z, s) * exp(-z);
+	return scale/level1;
+}
+
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
+{
+	//  Float2 overload: four-level continued fraction, innermost level first.
+	const float2 level4 = float2(7.0) + z - s;
+	const float2 level3 = float2(5.0) + z - s + (3.0*s - float2(9.0))/level4;
+	const float2 level2 = float2(3.0) + z - s + (2.0*s - float2(4.0))/level3;
+	const float2 level1 = float2(1.0) + z - s + (s - float2(1.0))/level2;
+	const float2 scale = pow(z, s) * exp(-z);
+	return scale/level1;
+}
+
+float uigamma_large_z_impl(const float s, const float z)
+{
+	//  Float overload: four-level continued fraction, innermost level first.
+	const float level4 = 7.0 + z - s;
+	const float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
+	const float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
+	const float level1 = 1.0 + z - s + (s - 1.0)/level2;
+	const float scale = pow(z, s) * exp(-z);
+	return scale/level1;
+}
+
+//  Normalized lower incomplete gamma function for small s (implementation):
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
+{
+    //  Requires:   1.) s < ~0.5
+    //              2.) s_inv = 1/s (precomputed so callers can reuse it)
+    //              3.) gamma_s_inv = 1/gamma(s) (likewise precomputed)
+    //  Returns:    An approximation of the normalized lower incomplete gamma
+    //              function for s < 0.5.  Restricting s this way leaves only
+    //              two z ranges to handle instead of four.  Each range uses
+    //              four terms, for a max relative error of ~0.00182.  The
+    //              switch-over threshold and details were adapted (for fewer
+    //              terms) from the Gil/Segura/Temme paper:
+    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
+	//  Both ranges are always computed and then blended with 1.0/0.0 weights;
+	//  real branches tested slower even where available.
+	static const float switch_z = 0.775075;
+	const bool4 use_large_z = bool4(z.x > switch_z, z.y > switch_z,
+	    z.z > switch_z, z.w > switch_z);
+	const float4 from_upper = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float4 from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	//  Per component, weight from_upper by 1.0 where z > switch_z and
+	//  from_lower by 1.0 everywhere else:
+	const float4 upper_weight = float4(use_large_z);
+	const float4 lower_weight = float4(not(use_large_z));
+	return from_upper * upper_weight + from_lower * lower_weight;
+}
+
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
+{
+	//  Float3 overload: both z ranges are evaluated, then blended with
+	//  1.0/0.0 weights chosen per component by z > 0.775075.
+	//  s_inv must be 1/s and gamma_s_inv must be 1/gamma(s).
+	static const float switch_z = 0.775075;
+	const bool3 use_large_z = bool3(z.x > switch_z, z.y > switch_z, z.z > switch_z);
+	const float3 from_upper = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float3 from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	const float3 upper_weight = float3(use_large_z);
+	const float3 lower_weight = float3(not(use_large_z));
+	return from_upper * upper_weight + from_lower * lower_weight;
+}
+
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
+{
+	//  Float2 overload: both z ranges are evaluated, then blended with
+	//  1.0/0.0 weights chosen per component by z > 0.775075.
+	static const float switch_z = 0.775075;
+	const bool2 use_large_z = bool2(z.x > switch_z, z.y > switch_z);
+	const float2 from_upper = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float2 from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	const float2 upper_weight = float2(use_large_z);
+	const float2 lower_weight = float2(not(use_large_z));
+	return from_upper * upper_weight + from_lower * lower_weight;
+}
+
+float normalized_ligamma_impl(const float s, const float z,
+    const float s_inv, const float gamma_s_inv)
+{
+	//  Float overload: evaluate both z ranges, keep one via 1.0/0.0 weights.
+	static const float switch_z = 0.775075;
+	const float from_upper = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float from_lower = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	const bool use_large_z = z > switch_z;
+	return from_upper * float(use_large_z) + from_lower * float(!use_large_z);
+}
+
+//  Normalized lower incomplete gamma function for small s:
+float4 normalized_ligamma(const float4 s, const float4 z)
+{
+    //  Requires:   s < ~0.5
+    //  Returns:    Approximate normalized lower incomplete gamma function of
+    //              (s, z); normalized_ligamma_impl() documents the method.
+	const float4 recip_s = float4(1.0)/s;
+	const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float3 normalized_ligamma(const float3 s, const float3 z)
+{
+	//  Float3 overload: derive 1/s and 1/gamma(s), then defer to the impl.
+	const float3 recip_s = float3(1.0)/s;
+	const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float2 normalized_ligamma(const float2 s, const float2 z)
+{
+	//  Float2 overload: derive 1/s and 1/gamma(s), then defer to the impl.
+	const float2 recip_s = float2(1.0)/s;
+	const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+float normalized_ligamma(const float s, const float z)
+{
+	//  Float overload: derive 1/s and 1/gamma(s), then defer to the impl.
+	const float recip_s = 1.0/s;
+	const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
+	return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
+}
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+////////////////////////////  END SPECIAL-FUNCTIONS  ///////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(intermediate_output, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Standard gamma constants; #define OVERRIDE_STANDARD_GAMMA to supply all seven yourself:
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?  Default input/output/intermediate gamma below.
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default for get_crt_gamma()
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default for get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it, so the default keeps each color's own alpha.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS  //  If defined, the user supplies assume_opaque_alpha instead
+    static const bool assume_opaque_alpha = false;  //  true: encode/decode functions below return alpha = 1.0
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Device gamma getters (CRT, Game Boy Advance, LCD), overridable by the user:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Defaults: forward to the standard constants (the GBA value is hard-coded)
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD  //  Tested first, so it wins if several SIMULATE_* macros are defined
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD  //  Second in precedence
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT  //  Third in precedence
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT  //  Lowest precedence
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO  //  Default: only FIRST_PASS decodes and only LAST_PASS encodes
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Later passes read what the previous pass encoded with the intermediate gamma
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Earlier passes encode with the intermediate gamma
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif
+
+//  Users might want to know if bilinear filtering will be gamma-correct (true exactly when linearize_input is false):
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encodes color for output from the current pass.  Returns color
+    //  untouched when gamma_encode_output is false.  Otherwise rgb is raised
+    //  to 1.0/get_pass_output_gamma(), and alpha is either passed through or
+    //  forced to 1.0 when assume_opaque_alpha is set.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    const float3 exponent = float3(1.0/get_pass_output_gamma());
+    const float3 encoded_rgb = pow(color.rgb, exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(encoded_rgb, 1.0);
+    }
+    return float4(encoded_rgb, color.a);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Decodes color with the current pass's input gamma.  Returns color
+    //  untouched when linearize_input is false.  Otherwise rgb is raised to
+    //  get_pass_input_gamma(), and alpha is either passed through or forced
+    //  to 1.0 when assume_opaque_alpha is set.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    const float3 exponent = float3(get_pass_input_gamma());
+    const float3 linear_rgb = pow(color.rgb, exponent);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Raises color.rgb to the caller-supplied per-channel gamma, regardless
+    //  of linearize_input.  Alpha is 1.0 if assume_opaque_alpha, else kept.
+    const float3 linear_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(linear_rgb, 1.0);
+    }
+    return float4(linear_rgb, color.a);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: look up tex through COMPAT_TEXTURE and pass the result to decode_input().
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 overload: only tex_coords.xy takes part in the lookup.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is handed to textureLod() as its third (LOD) argument, not applied as a texel offset -- confirm intended.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  float3 + texel_off overload: uses tex_coords.xy; texel_off is again the third argument of textureLod().
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: the 2-argument form always passes LOD 0.0 to textureLod(); tex_coords.zw are not read.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): here texel_off becomes the textureLod() LOD argument rather than a texel offset -- confirm intended.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
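+/////////*  NOTE(review): this line has no comment terminator (star then slash), so the block comment opened above "tex3D:" runs on to the terminator just before the live "tex2Dlod:" gamma wrappers; everything in between, tex2D_linearize_gamma() included, is compiled out -- confirm intended.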
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: decode with an explicit gamma; the 3-argument form always passes LOD 0.0 and does not read tex_coords.zw.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+//  NOTE(review): texel_off is passed as the textureLod() LOD argument rather than a texel offset -- confirm intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+////////////////////////////////  END INCLUDES  ////////////////////////////////
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{   //  Maps a source color to a per-channel Gaussian beam standard deviation.
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  containing the desired minimum and maximum beam standard
+    //                  deviations, for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; we take
+    //                  sigma_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
+    //              inner f(color) subfunction (see below) as:
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The standard deviation of the Gaussian beam for "color:"
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube balances
+    //  between variable width and a soft/realistic shape.   A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the beam_max_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, beam_max_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, beam_max_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, beam_max_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, beam_max_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, beam_max_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Use a power function: f(color) = pow(color, beam_spot_power)
+        return float3(beam_min_sigma) + sigma_range *
+            pow(color, float3(beam_spot_power));
+    }
+    else
+    {
+        //  Use a spherical function: f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+        const float3 color_minus_1 = color - float3(1.0);
+        return float3(beam_min_sigma) + sigma_range *
+            sqrt(float3(1.0) - color_minus_1*color_minus_1);
+    }
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{   //  Maps a source color to a per-channel generalized Gaussian shape (beta).
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  containing the desired min/max generalized Gaussian
+    //                  beta parameters, for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; we take
+    //                  shape_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian "shape" parameter beta for
+    //              the given color:
+    //                  beta = beam_min_shape + shape_range * pow(color, beam_shape_power)
+    //  Beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    return beam_min_shape + shape_range * pow(color, float3(beam_shape_power));
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  Details:
+    //  The CRT beam profile follows a roughly Gaussian distribution which is
+    //  wider for bright colors than dark ones.  The integral over the full
+    //  range of a Gaussian function is always 1.0, so we can vary the beam
+    //  with a standard deviation without affecting brightness.  'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  Use a numerical approximation of the "error function" (the Gaussian
+    //  indefinite integral) to find the definite integral of the scanline's
+    //  average brightness over a given pixel area.  Even if curved coords were
+    //  used in this pass, a flat scalar pixel height works almost as well as a
+    //  pixel height computed from a full pixel-space to scanline-space matrix.
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 ph_offset = float3(pixel_height * 0.5);    //  half the pixel height
+    const float3 denom_inv = 1.0/(sigma*sqrt(2.0));         //  1/(sigma*sqrt(2)), the erf argument scale
+    const float3 integral_high = erf((dist + ph_offset)*denom_inv);
+    const float3 integral_low = erf((dist - ph_offset)*denom_inv);
+    return color * 0.5*(integral_high - integral_low)/pixel_height;    //  integral over the pixel span, divided by its height
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models the standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 s = float3(1.0)/beta;
+    const float3 ph_offset = float3(pixel_height * 0.5);
+    //  Pass beta to gamma_impl to avoid repeated divides.  Similarly pass
+    //  beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
+    const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta);
+    const float3 dist1 = dist + ph_offset;    //  upper edge of the pixel span
+    const float3 dist0 = dist - ph_offset;    //  lower edge of the pixel span
+    const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
+        s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
+    const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
+        s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
+    return color * 0.5*(integral_high - integral_low)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  See scanline_gaussian_integral_contrib() for detailed comments!
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
+    //  Avoid repeated divides:
+    const float3 sigma_inv = float3(1.0)/sigma;
+    const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv;    //  1/(2*sigma**2)
+    const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi);         //  1/(sigma*sqrt(2*pi))
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel away in each direction as well:
+        const float3 sample_offset = float3(pixel_height/3.0);
+        const float3 dist2 = dist + sample_offset;
+        const float3 dist3 = abs(dist - sample_offset);
+        //  Average three pure Gaussian samples:
+        const float3 scale = color/3.0 * outer_denom_inv;
+        const float3 weight1 = exp(-(dist*dist)*inner_denom_inv);
+        const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv);
+        const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv);
+        return scale * (weight1 + weight2 + weight3);
+    }
+    else
+    {
+        return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv;    //  single sample at dist
+    }
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
+    float3 color, float pixel_height, float sigma_range,
+    float shape_range)
+{
+    //  See scanline_generalized_gaussian_integral_contrib() for details!
+    //  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Avoid repeated divides:
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 beta_inv = float3(1.0)/beta;
+    const float3 scale = color * beta * 0.5 * alpha_inv /
+        gamma_impl(beta_inv, beta);    //  color * beta/(2*alpha*gamma(1/beta))
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel closer to and farther from the scanline too.
+        const float3 sample_offset = float3(pixel_height/3.0);
+        const float3 dist2 = dist + sample_offset;
+        const float3 dist3 = abs(dist - sample_offset);
+        //  Average three generalized Gaussian samples:
+        const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta));
+        const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta));
+        const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta));
+        return scale/3.0 * (weight1 + weight2 + weight3);
+    }
+    else
+    {
+        return scale * exp(-pow(abs(dist*alpha_inv), beta));    //  single sample at dist
+    }
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Dispatcher for the four scanline contribution codepaths.
+    //  Requires:   The requirements of scanline_gaussian_integral_contrib(),
+    //              get_gaussian_sigma() and get_generalized_gaussian_beta()
+    //              must all be met.
+    //  Returns:    A scanline's light output over a given pixel.  The global
+    //              beam_antialias_level picks integrals (> 1.5) or sampling,
+    //              and beam_generalized_gaussian picks the generalized or
+    //              the pure Gaussian distribution.
+    if(beam_antialias_level > 1.5)
+    {
+        if(beam_generalized_gaussian)
+        {
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {
+            return scanline_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+    else
+    {
+        if(beam_generalized_gaussian)
+        {
+            return scanline_generalized_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {
+            return scanline_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    //  Combine color0-3 with weights via mul(), then use max to avoid bizarre artifacts from negative colors:
+    return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  The weight is static or dynamic branches are allowed, so we can branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are colors in gamma-encoded RGB.  No branch: both blends are always computed.  pow() needs a float3 exponent here, as in the branching path.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.  No branch: both blends are always computed.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+            //  FIXME(review): the disabled line below declares an otherwise unused constant weight of 1.0 -- confirm whether it can be removed.
+//			const float beam_horiz_linear_rgb_weight1 = 1.0;
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (color1)
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  color0, color1, color2, and color3, where color1 is just
+    //                  left of the output pixel.
+    //  Returns:    Return a horizontally interpolated texture lookup using 2-4
+    //              nearby texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  We can ignore the outside texture lookups for Quilez resampling.
+    const float3 color1 = COMPAT_TEXTURE(tex, scanline_uv).rgb;
+    const float3 color2 = COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
+    float3 color0 = float3(0.0);    //  stays zero unless beam_horiz_filter > 0.5
+    float3 color3 = float3(0.0);    //  stays zero unless beam_horiz_filter > 0.5
+    if(beam_horiz_filter > 0.5)
+    {
+        color0 = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
+        color3 = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    //  Sample the texture as-is, whether it's linear or gamma-encoded:
+    //  get_interpolated_linear_color() will handle the difference.
+    return get_interpolated_linear_color(color0, color1, color2, color3, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.  (From usage: tex_uv*tex_size is treated as texel coords, and texel coords*texture_size_inv as uv.)
+    //  Snap to the previous texel and get sample dists from 2/4 nearby texels:
+    const float2 curr_texel = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y);    //  snapped in x only; y is kept as-is
+    const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv;
+    const float prev_dist = curr_texel.x - prev_texel_hor.x;
+    const float4 sample_dists = float4(1.0 + prev_dist, prev_dist,
+        1.0 - prev_dist, 2.0 - prev_dist);
+    //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
+    float4 weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez: only the two inner texels get nonzero weights.
+        const float x = sample_dists.y;
+        const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
+        weights = float4(0.0, 1.0 - w2, w2, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian: exp(-d*d/(2*beam_horiz_sigma*beam_horiz_sigma)) per sample distance d.
+        float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        weights = exp(-(sample_dists*sample_dists)*inner_denom_inv);
+    }
+    else
+    {
+        //  Lanczos2: 2*sin(pi*d)*sin(pi*d/2)/(pi*d)^2, with pi*d passed through FIX_ZERO first.
+        const float4 pi_dists = FIX_ZERO(sample_dists * pi);
+        weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) /
+            (pi_dists * pi_dists);
+    }
+    //  Ensure the weight sum == 1.0:
+    const float4 final_weights = weights/dot(weights, float4(1.0));
+    //  Get the interpolated horizontal scanline color:
+    const float2 uv_step_x = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        tex, prev_texel_hor_uv, uv_step_x, final_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Returns:    The horizontally filtered scanline color at tex_uv.  With
+    //              beam_misconvergence set, red, green, and blue are each
+    //              sampled at their own horizontally shifted position.
+    //  tex_size and texture_size_inv are forwarded unchanged to
+    //  sample_single_scanline_horizontal().
+    if(beam_misconvergence)
+    {
+        //  Per-channel x offsets, scaled by texture_size_inv.x and then
+        //  subtracted from the lookup position:
+        const float3 shift_u = get_convergence_offsets_x_vector() *
+            texture_size_inv.xxx;
+        //  Run the single-scanline filter once per channel, keeping only
+        //  the matching component of each result:
+        const float red = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(shift_u.r, 0.0), tex_size, texture_size_inv).r;
+        const float green = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(shift_u.g, 0.0), tex_size, texture_size_inv).g;
+        const float blue = sample_single_scanline_horizontal(tex,
+            tex_uv - float2(shift_u.b, 0.0), tex_size, texture_size_inv).b;
+        return float3(red, green, blue);
+    }
+    //  No misconvergence: a single lookup serves all three channels.
+    return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+        texture_size_inv);
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Returns:    uv of the center of the last/upper scanline belonging to the
+    //              current field, relative to the sample at tex_uv.
+    //  Outputs:    dist = (sample y - scanline center y) in units of scanlines.
+    //  With interlacing, only even or odd scanlines are considered on a given
+    //  frame: top-field first (TFF) puts even scanlines on even frames, BFF
+    //  puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If coords seem wrong, first check that anisotropic filtering
+    //  isn't blurring across field boundaries.  TFF/BFF won't matter for
+    //  sources that double-weave or similar.
+    //  FIXME: the port left an unexplained "fixme" on the interlace_bff term.
+    //  floor(step * 0.75) is 0 for a step of 1 and 1 for a step of 2, so the
+    //  frame-parity term only shifts the field when lines are being skipped:
+    const float frame_parity = fmod(frame_count + float(interlace_bff), 2.0);
+    const float field_shift = floor(il_step_multiple.y * 0.75) * frame_parity;
+    //  Integer texel just before the sample (under_half fixes a rounding bug
+    //  right around exact texel locations):
+    const float2 sample_texel = tex_uv * tex_size;
+    const float2 floor_texel = floor(sample_texel - float2(under_half));
+    //  Step back by however many rows floor_texel is past the current field:
+    const float rows_past = fmod(floor_texel.y + field_shift, il_step_multiple.y);
+    const float2 line_center = floor_texel - float2(0.0, rows_past) + float2(0.5);
+    dist = (sample_texel.y - line_center.y)/il_step_multiple.y;
+    return line_center * texture_size_inv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Guess from the source line count whether the input is interlaced.
+    //  Always false when interlace_detect is off.
+    if(interlace_detect)
+    {
+        //  Reference line counts:
+        //      NTSC: 525 total, 262.5/field; 486 active (2 half-lines), 243/field
+        //      NTSC emulators: typically 224 or 240
+        //      PAL: 625 total, 312.5/field; 576 active (typical), 288/field
+        //      PAL emulators: ?
+        //      ATSC: 720p, 1080i, 1080p
+        //  Cutoff policy:
+        //  1.) Only active lines matter.
+        //  2.) More than 288 and up to 576 lines: probably interlaced.
+        //  3.) More than 576 lines: probably progressive...
+        //  4.) ...except exactly 1080 lines, which is left to the user via
+        //      interlace_1080i.
+        //  5.) Thresholds sit on half-lines in case the host program
+        //      reports calculated (non-integer) video sizes.
+        const bool in_sd_range = (num_lines > 288.5) && (num_lines < 576.5);
+        const bool is_1080 = (num_lines > 1079.5) && (num_lines < 1080.5);
+        const bool treat_1080_as_interlaced = bool(interlace_1080i) && is_1080;
+        return in_sd_range || treat_1080_as_interlaced;
+    }
+    //  Detection disabled:
+    return false;
+}
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  END SCANLINE-FUNCTIONS  ////////////////////////////
+
+//#include "../../../../include/gamma-management.h"
+
+////////////////////////////  BEGIN GAMMA-MANAGEMENT  //////////////////////////
+
+#ifndef GAMMA_MANAGEMENT_H
+#define GAMMA_MANAGEMENT_H
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(intermediate_output, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Standard gamma constants (user-supplied instead if OVERRIDE_STANDARD_GAMMA):
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55); default for get_crt_gamma()
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB; default for get_lcd_gamma()
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  If true, the encode/decode functions below return alpha = 1.0 instead of
+//  color.a.  This might help users avoid some bugs, but only if they know.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    static const bool assume_opaque_alpha = false;
+#endif  //  OVERRIDE_ALPHA_ASSUMPTIONS
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants (as functions), but allow users to override them:
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
+#else   //  Built-in defaults:
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides:
+#ifdef OVERRIDE_FINAL_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
+#else   //  Derive them from the SIMULATE_* macros (first match below wins):
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #ifdef SIMULATE_CRT_ON_LCD
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_LCD
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
+    #else
+    #ifdef SIMULATE_LCD_ON_CRT
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else
+    #ifdef SIMULATE_GBA_ON_CRT
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_GBA_ON_CRT
+    #endif  //  SIMULATE_LCD_ON_CRT
+    #endif  //  SIMULATE_GBA_ON_LCD
+    #endif  //  SIMULATE_CRT_ON_LCD
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
+#ifndef GAMMA_ENCODE_EVERY_FBO  //  Only the first pass decodes, only the last encodes
+    #ifdef FIRST_PASS
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Not the first pass: input is used as-is (gamma of 1.0)
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Not the last pass: output is written as-is (gamma of 1.0)
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif  //  LAST_PASS
+#else   //  GAMMA_ENCODE_EVERY_FBO: every pass decodes its input and encodes its output
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else   //  Later passes read what the previous pass encoded
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif  //  FIRST_PASS
+    #ifdef LAST_PASS
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else   //  Earlier passes encode with the intermediate gamma
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif  //  LAST_PASS
+#endif  //  GAMMA_ENCODE_EVERY_FBO
+
+//  Bilinear filtering is gamma-correct only if this pass needs no input decoding:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
+{
+    //  Gamma-encode a linear color for this pass's output.
+    //  Returns:    color unchanged if gamma_encode_output is false; otherwise
+    //              rgb raised to 1.0/get_pass_output_gamma(), with alpha
+    //              forced to 1.0 when assume_opaque_alpha is set and passed
+    //              through otherwise.
+    //  Static constant, so the compiler can drop the unused path:
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    const float3 inv_gamma3 = float3(1.0/get_pass_output_gamma());
+    const float3 encoded_rgb = pow(color.rgb, inv_gamma3);
+    //  Alpha is never gamma-encoded:
+    const float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(encoded_rgb, out_alpha);
+}
+
+inline float4 decode_input(const float4 color)
+{
+    //  Linearize a color sampled from this pass's input.
+    //  Returns:    color unchanged if linearize_input is false; otherwise
+    //              rgb raised to get_pass_input_gamma(), with alpha forced
+    //              to 1.0 when assume_opaque_alpha is set and passed
+    //              through otherwise.
+    //  Static constant, so the compiler can drop the unused path:
+    if(!linearize_input)
+    {
+        return color;
+    }
+    const float3 pass_gamma3 = float3(get_pass_input_gamma());
+    const float3 linear_rgb = pow(color.rgb, pass_gamma3);
+    //  Alpha is never gamma-decoded:
+    const float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, out_alpha);
+}
+
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Linearize color.rgb with a caller-supplied per-channel gamma.  Unlike
+    //  decode_input(), this ignores the pass's linearize_input setting.
+    //  Returns:    pow(color.rgb, gamma), with alpha forced to 1.0 when
+    //              assume_opaque_alpha is set and passed through otherwise.
+    const float3 linear_rgb = pow(color.rgb, gamma);
+    //  Alpha is never gamma-decoded:
+    const float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return float4(linear_rgb, out_alpha);
+}
+
+//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
+//#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
+// EDIT: it's the 'const' in front of the coords that's doing it
+
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+*/
+//  tex2D: (coords are deliberately non-const; see the offset-blur note above)
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords));   }
+//  float3 overload: only tex_coords.xy reaches the lookup; .z is ignored.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
+{   return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy));   }
+//  NOTE(review): texel_off is handed to textureLod() as its LOD argument.
+inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+//  NOTE(review): same here; confirm an LOD (not a texel offset) is intended.
+inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex2Dlod: (NOTE(review): tex_coords.zw is ignored; the LOD is fixed at 0.0)
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, 0.0));    }
+//  NOTE(review): texel_off is passed as the LOD here, not as a texel offset.
+inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUTs and for reading the input of pass0 in a later pass.
+
+//  tex2D: fetch the texel with COMPAT_TEXTURE, then decode it with the caller-supplied gamma.
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   float4 raw_sample = COMPAT_TEXTURE(tex, tex_coords);   return decode_gamma_input(raw_sample, gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   float4 raw_sample = COMPAT_TEXTURE(tex, tex_coords.xy);   return decode_gamma_input(raw_sample, gamma);   }   //  Only .xy of the float3 coordinate is used.
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+*/
+//  tex2Dlod: samples with textureLod() at tex_coords.xy and decodes with the given gamma.  NOTE(review): tex_coords.zw are never read - confirm that is intended.
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }   //  The LOD argument is the literal 0.0.
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }   //  NOTE(review): texel_off is passed as textureLod()'s lod argument, not as a texel offset - confirm intent.
+
+
+#endif  //  GAMMA_MANAGEMENT_H
+
+////////////////////////////  END GAMMA-MANAGEMENT  //////////////////////////
+
+#undef COMPAT_PRECISION
+#undef COMPAT_TEXTURE
+
+void main() {
+    gl_Position = position;
+    vTexCoord = texCoord * 1.0001;  //  NOTE(review): the 1.0001 scale is not explained here - confirm its purpose.
+
+    //  Interlacing detection: il_step_multiple holds the step between lines,
+    //  i.e. y is 1 for progressive sources and 2 for interlaced sources.
+    float2 input_dims = video_size.xy;
+    float interlace_step = 1.0 + float(is_interlaced(input_dims.y));
+    il_step_multiple = float2(1.0, interlace_step);
+    //  uv tex coords step from one texel (x) and one scanline (y) to the next:
+    uv_step = il_step_multiple / texture_size;
+
+    //  When shader parameters are in use, {min, max}_{sigma, shape} are runtime
+    //  values.  {sigma, shape}_range would be computed here, outside of
+    //  scanline_contrib(), so they are not recomputed once per scanline (6
+    //  times per fragment and up to 18 times per vertex):
+    //  TODO/FIXME: if these aren't used, why are they calculated? commenting for now
+//    const float sigma_range = max(beam_max_sigma, beam_min_sigma) -
+//        beam_min_sigma;
+//    const float shape_range = max(beam_max_shape, beam_min_shape) -
+//        beam_min_shape;
+
+    //  Pixel height in scanlines, needed for antialiased/integral sampling:
+    //  the input/output height ratio divided by the interlacing line step.
+    pixel_height_in_scanlines =
+        (input_dims.y / output_size.y) / il_step_multiple.y;
+}
\ No newline at end of file
diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png b/shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5Spacing.png
new file mode 100644
index 0000000000000000000000000000000000000000..2995ae5f4cd1c7f0c72732286e49f255c641ac5c
GIT binary patch
literal 198848
zcmWh!1ytNz6J6Y8ahF}(i@U?(PSN7l;#%Bgai_Q!Z=rZU9G2qNQmp6}D^T3Kus{FF
zNnVnZ^D=qw&Sc)5nM6Gu2mvk)E&u=^P*+nn002;5PEi5a|8GKQls>&Q=*|k-3ILS&
z@>hp>)-TtXcA5}n0P_D^d0%bz%N-mqH8WoT0FU(lI|`t%m<j-h1gI-37zM6{KCRC2
zu6F*#x~X%%Aa5L>4!iA~>+1~KLBSUsr*Z=`5SyB!CM&A-q70L@o)1K4XxUoHOH<)6
zTcDzlcw$?pC@@ViW#D%<ht2kdoJ|{t{^Y6LWxiZ?N(Nr+EiS#@pA>OcPFaAQed^i_
zPUiuE>=-Z7RS{j_=NO<dOa!hAJb=n6BgpiXm6a*IQ|mypzzA-51j$9SYB9Jn=^hL8
zJOxCKLbo86;7ce7PWMW65$Fh`dZm&BRDzX(p2dL3WM~AqG8M`OXH2?pRzZw`WtC4r
zlh7`xCs0Y5J`-{RN`ivyQtnMa&;NkP%TVtPWP~uhEb%^FMGNSz%&?$L2Wo{H!Zei`
zGa=VNdl(W7FI#?E1|m11NDWwoD0~e10qUwuudl2H{0EhU(_K*LE3c~{=q~tG5qLnP
zsxl?82O1#`FH5^mSI&_>C-zPhS6&1*1K+^-DwAEn&o@w{K5T%_`}@<2yf&~18F(4=
zo_Imk64bA%46B7o+NIQiYM=+uS|~CEhU9@GDZCS*f^dHDb0RcC5Pp_k2PS-Z_p8eE
zz~sP0D75EgTZX>$gQU6`NWVwU9KRePpl1sp(o6Z9r16V9O3J`j%DpP>Kow=mi!RV}
zg1PhtQjY9m3wp1sx(=}f=>Um<{xAy|aVBK<MOIIs84Sq|FH66-Xe}F%gUc~qyo^yY
zl+A84=M8M|?=+hL{|-_L9>H+IuY%|SKJ`8$cc4fom>k6y=aEOKY-JJ^m=9q~dGRuP
z$_QWAe-C4z=K^R%gmOBL)z{3Vq9mv+oaFzaU;&Xyr$|p#L@oH)@c#pn0M&%uL6u<a
zO)e=^;P{ux7$CnGNW1?4dgfi7lOws<N__dcRwy*z^nC2)Ib!d<e@ZZ<tGnFQ!_%26
zq6#QcBiX!!$S_u>f2Cpx;sYYhVMtRseN{vl5NQO9V20lX1ilOJ+wXejf_p-wRS?#o
z=Pn>}KNNWdfBFGL_CxJq%$2EuY2uJYpqz>_>;rTV^!yWww1OcyzaoR4kV<Z*%5>mc
zVAu;#DZQbG;C>Z^F>EX8o)Yxj@!&c8d@w+Ep{s&mu1p*U%{HyRfl*zMEGVx-9Dos0
zaO;$NYgI&qKN1-TlcT+0gm$Z*fNp_7uv~k&W_?n9)m~KuBj~vnh}6a3?R-YM!Q>bg
zyzUR|m+ka>USuP?;DuYK-XDOU=RA<eum1<dOCJZ*xj+o|O9?mu<Q$oLZ>@qj13k~W
zA=|F*n^h6)pav)%<)ZTSi;~G!rirWW0$=du3X`k7;UAH`>OTXwLg_c9KsxK+8w-@<
zI#elMRN>-hTUEc$QUa6*lpYABdI9JQ&dC;3wf?)Lx}dmVt=zq)7w2nU*I7xN@%IkH
zO$m_Y!x(dUzIk{ag9=tAi7OjD?<E%1gHEyONuAaqT0m{(kvt>d$1TNK4EE<85x9sQ
zT+YAkll$gF=o540sLAZ66xeP$PY>b{8r1cm1eW{qTXrw_pvV11JR)EMbe(_G*Y4KR
z0Zy?GcpP1q$7C@zo#lgP-tdFdieW*NlhA)JV5huzQ5mWwxmR*K{4Z#hQ<nCkY{Pl0
zvTtL`a%^qad-b?1{<jNCcos2g&^j}E488Ax;$sPhAo<|1=eY>qj3*uRwTGNjiEE7G
z#a4dmh@kMK`G<+sU7%qmd&B=Hd*j8#pxmwe+qq!nz;`8w4L(r(_cAkB;gVO!&VkXT
zVAIvPnSt))7o7pYn?Y%bPjl=Cn@bV56{MmjzZP7=F0QFTvw`^%cDj*i!xVc#n7jcF
z5gP^@E!&f61l(ZR7t<zUy(u2ucGcMZR};qV;kubq{M(Y<<A<D#bEgftP(Y)@3{6U~
zZRm#=+ZK`H0!{;hoxL6Wr=X&6nuUwzF5QVAr>0YsaGg=kqrk!IqAr5`_YvWbZ(w84
zbe;)Bww%3cSSt$Yv%mSLp_ANqgi4)30F7tWp@^b8@R*;_^9?>!mw>~|UgCBh(yQzM
z40&x$RO}i4K<2%dfLD5~yxpf7nlgStYIeEz5Jd!7e(+5j2ke?Gx%_?>qRR(O#V%Na
zAE98YumH3>hCxA{ty(A}l-zun;n&L5-5EAh*G!LUuQGmd{_4h?7jX8)&Ax)KM(JK+
zw?&+yNTF=H;h0FgH(|9XcM;u|fow1L?u6i}@=L#Yd<vp$j}?N3bo>bUHr+Y{;W+O`
zS>ZG7<*uL3to!di7kE(KUC`D}{VI?NbFqA4^of${E#<{@5+OzW4R<ZN%v;KXlj?0s
zO{qulRvCYGWdg}rMbl1dj<;-%gon$RF8^u7Y!{8qqZaCP{os4cHG0_-R4VgORfL6o
z50jkG*>U7kIA|F0pN_=p6ZgM$;t4$8D)2xzn)^D1=b-38_ttra(B6xhv;{Ff!JnVF
zHF*=nVroR;xhnOhC!mMR$e95fHH>EnR=YitYfCYc9)nvtdPvBlI#oQ=hc~6x@A&Q4
zQ^X;W_&MsLcZxU!KM}3W1kr<XtqDWd65r`+fO5f=V2sP3?I|w|VT_=F7fnXtH&&Il
zBfmnjH3g4Eeg?9?ZnkrGxFgUEw(X#_?pKypQ1}zdAn+e`_w3-qu+MralH`JJ!Pc)$
zP8?uq9$qK3)w&Z#2Scy9ingD;#~FIG!0b%FXNLR2&`i2f<(wSk#>&&`e6$E78LHD(
zAjPjf;GzeP?RV{Gi4qQ`Fb7M0#~9-kH~j~-hs&#EP22ocg2hyPZTuTFXKlg44DOfZ
zF@wI~bPEK66a20htr#I&u<axqW;7jL4U}4P)S#0GdZ*lT!of8Rv5lK8_6kmV&1h=L
zPh{)0=UVjMWBbl$D$SyBP1shVxK}u>&!sCY?y%NEGu<YudAi$iH-cC0n&QGi{7+|~
zr+{t{Z2Su&xb(&JJHni8!u|_dizxXUf-p;0$z8E)%GPtYnu`eBCgb*-)&InTdG^qp
ze1)X=5+kr`2AEli`}TeAx(dIadFu}WKr3*0t@jfC2;Oy{OLcI-HSyqj-}RgN5cY&u
zCQ6Ze5JKG@H;dx(2U8Cod>dNzJH*)sLAt&)3<&1=tsE?N*<ljX?E5ZIAT&%$9&4q!
zRAs81l+I9s$t+aW&RD&AeMoJ$*cPa41`m^bZf|eyQ|)cZC%<<|VBKsr)wlVmZxf<f
zyq^-#d2}opMCb-*;1`O?i21Z6BPPOsG<v63YXYc^MC#3&XTm4UDYu%{iB<boEw}_?
zqA&yMgXf|+l;<ca1(Ou59kIc>{mq85&MP6Gvoq0i^kV*@qp)Jx4Z5wp_!&a9>)W)o
zEMWvArb3ew_*F-q&ebF`kTCBnA-*tx=iipS?9sJJ#LywuJ>7)}oEn;$4dLQCm3X5K
z+mH;_od<8dm}l@G3-=Q^?Pu4bfy4S=>)r`wu+j1J=r}rg)NHv3_3j?9cf@}2vQx>3
z6F?6$v_)_h`i9c&9(yrV8(<1)>OC+EZfU^!A>$mRA+XXLz6FwvdR8UW(iJF18>BV!
zc%0Ztp&cO@GNkjSm57k=w~~R}NWcrw_*=GsVf|>j6oA)wJtMJPZ7rk`g8K12jy7WU
zdc*pc%~cBZbnLH23a87Me^2qh=!#6>9y(UN<ar#AZCQ6F4jRw63dCR|Ub%fVYwKxO
zpcCHL(Q+xyRrm3;{cu)r+U{0rVCsFQ<oDUuV_9#2NcH=3Mx*}qOvKN5vXRF}>i4Bl
z%djAAVuvfIfWT-K%z)H;UzoLbq)_Fw8c6nF&t$fS?zj7ut6<-Q*EH}fO|JQV0XIh5
z+Yd!G2z*G%<Qv8nsW54KV0u=HWNVT7;9#ZxF?wX3RfL6pYJx;=NB&{P{aLSK0Cf73
z>cOR0-6_-N0{i2`8nBZ;L4cYklDvCy7EPY=B5BMZv2y>wDbNoS_PU6wGAR&x{|YW^
zmmpCV)k}0T9`Gog=@uS2a108sAS{U?*qfeoz|oWra1Pon!{JPTGF`kTDG1+BK*(c0
zXpmR{{hX$x6diabu^al0i_h^G&hX3M7hlIJ@7+?1Gedf{J*d@YIz5f}qX~||{UmmB
z<uQMt&TzM`H~9jN31DgaAaX}X+m*K^xUV&5b$Bi@%7hqqoo^i%XFsNA{vz_LjsdxI
zD-9;2S!N<m?tc-`HvS7wz6U2oTHp!~kv~h=QC@IX4sYDqL>H*iu6`rHwG*aj97!>{
z|3TQNxWp4)2~vH(;+61YYedB|(TX!d$h#4|yjUBTOEz&wb|Ir)v4_UTbWU%5>Lue`
zsaVEi6`35Kc8_ZX+?>Vc^bsQvJ|Tc%4{fTTiwrZOlqxDseyP2im2bhl1Kt_A;r9H(
z&G>R&3ipDzz1`k{G;Snf8v!<<3K+9*2M>=~Kd$$`kLPZD>R(kDPj&cfHjIJ#!MVUT
zfGG8!n&^XDZrxDyMCx#ndTA2=8s43W{7FnAnkdCUtV=UK;LsqIb#}z*l;vn(xIou*
zcUC-kRlSz(9?)(0_v#g#9~2%ho`K94MbYybiiX*<RT^Q<U^gW|%kqf-QS@mlT1M5Y
z)3uDq2Osq;qswcJv#3t-K@iq&HwRuh_pd~M^`MbHqEotprmuT7nHF2ieO0-o_tO6?
z7r7jgF+H1IH-WfNYnuN1fLC0rF(G!X|0q+JlTmPTZ~!e7=Q9mbCth(-;KYLbp*Tpk
z+|JwLk@|NtCFuvTpD=R6OQOO!wf!wbm1<%?LfPq8Uih^_4%mJi)Ik3k8BH**bRrO`
zLeZGh4~aJN{nb46$#X`LVyg`KZi*yOx(;jwdL|@WgDI}3d8b$Wq(H;UyG_b-@zhx1
zF^k5fa6uK}#-jk{U!4CEz8<G{viv%7gD-tMi~sfl(f7rRQtco8!A?q)+F1rWVL9_L
zk5>-we>F*aZbN>wwPMzrwa?5WIz;=WqNj2rC&BZ3GDG(S8UZ#}rzJ5IapH~5XX8j;
z3hUkFP*B1a_8`4v0a(uHVkOD>7g;_@?GSnkt0X4IADIjekxkO{H%w~?bgU;7tl-fz
zj};_2N+&w5M2cl9PVs#FzAI3rWg-f!|LS9l^J(PXTYRpse{q%a^X7R0O0+6|^c7g#
zTesK&l7x<F{P`!yh^L*tiGCPpIrbQrf(xs`>G*(AZ{_cV{T=!UuIT=tkkfycb_BSs
zdT#^v{cgc&{;^JKjv2!$tZEQ92#&9?3PL%pyZhg2|5=mtXN^SrWp28qPW;b;c~-1a
z3j5p3xxh`t63=>gyj;DO=9MPQ9tL*Y@JW2@iAi^_hvNBj0n>Cx^2J}M-F;MmmEcYL
zm_xQrCR)n~IBLlzU%|n`)wsl8Wd;s2HrC?Vc&yTEgAWEz^S=4HwPAzL|DgUdqCtM}
zW+>`F`&0i*t?wV*Mb*W-yY=&2@E6m8VUlM|nt|bv>4!Zi(cJPkAqDdYG}$1oisGZR
zssUe8z3b+t%`itwE<_M<sioli9wff?K$rZJB7s82_J4WX51ceF&b0JzzcbSY)WUiv
zS3_LZu@lVmlF6))-!tSm23Q;o4Ypt+VW9fZyrQg|15*{SF$5+C23m(AT*SIf2fo{V
zI_yo%F|5zX3CH_gb{;_hDQ_ptnVr6M?+Xf#Wgm8*dBiHV_B8%7`Y@9sH!~f=jdP;T
z(OCWXfv!SG)hFsGcW(xs@lY1wg*1Hh#Ak~m^w@_$9~KO@dteD4Vkve$t`9nmHEb&A
z%CewT54QDEVY@N0?fne#HKtvPsQ#&p5xGWI!`nrzzK*AL#s`Ni3YrYdW03i8MktJM
z$)7cCHasE;BiOtV;i*hQh+ksQBc*Tj`0kirEVJ^7;3r_L=7FDgu2C%K)C|b&i}b3{
z-|H?g2!6*lLPvrsWj30Z&}+vV!&<o-g1VXLIw>(em3aG0$trQ!ytsm7z}jcE^vwE@
z^*=EQmEb4WmxZ$-44gOM%#8I8P736k0pvzIpF7KJrF`7q)$rA9h~>$E&}*qbO}5Uk
zPHswib%q_v!ijPO8JmuNK|&Mnh1k!6KCyfPGk|4%hn+gF(pvkosu-UAfM_hJ-9irv
zD1dha>D8A+VFQ?X=&m)TSrfZ{?1WQrL@q*xH^rw=!u3C;`3O;aYlnFgSMeFME0f0z
zu3n2|Hs+=D3~$OL6ap2ZkHa!wb^d#{+jO1!eab_nb`)y%L0?s8)DmEDwTw6?z|r@p
zzI_ciC({UBl$on4T<NwN9Kuadf{_*{<4%?arA=&bqrOJC`{xie)#kT^8lvW`Mx4tM
z7`y8Nt--Po1eDtUn7~B~LW{m>Hvg=p=0g0M56Yq*0<kwWT6#--oqh%^H~*C^<L*Q&
zRW>FO+>w~Ih~t>3g}$Dvfu4Iqz-GzrE3D~J4$z3f2fOX5;!4uQiQ-Z4IcOh&4|#j6
zLfKw_Rz&!v-cJtb?0f&9iT}J_2=H9;m=X&`eYXA(4i7;3-uwX)hd@|9#J+7EJ<gJ1
zls`aY5R3WtTCM2QzjuVfea{ffSfk$(?oH&4zfImCG_k$}k9nIsl99E1UhFa?PrC*~
zut(OYKA9x5kVY`4?vk6l&Q(i9#<?=2WNQDl?%&a14|jxCR0@WHh|Fxd7Xm*oWaX{~
z%AUK#1i_7>zEk5#L^$DWwR!tZ;)tcS*o|Xxn<Z=hosOFGUooj2wAiJTz=#A7UrTWh
zTkp*APdw1y|6q)aI)F;CY|I|+se>)nW-D+*-VP6nh4YApojWMFZ(h8RXfSI)F*p7{
zi5EW37dB@VX9Dc#JT%%-X8`H6XS;NXZR-Noi8Qh9hbv`lt3B1xXQ!v6y_Ex&)ZZrs
zrsacV(T@MSLCL?Gn{YVNC`gNi^StWY1;28VkL{(2itwcQ7Kc`5Gf)N&-)83{>%V0E
zJAaY<hdPH$!ah44D5sbNjql4(NWRgX&b$F@mY}()2DBg4T~9oU)nEj}s<#9nam!*f
zLln!5#1nLCj8d^=l;T|^u~)r(!w2HqXz^5mNQR!-rr?f9&m^`3sHejlhDvst0xrSi
zjWuFi?x5F8>Msn3>dsZ~Qu!nB5ak!iTElxAk2=n#c%PnrV()bF2rw<REg7zsy=9h&
zm{_Itjg_H7!f+OJiY823btP|IQR(-5+K&a((=1kNkDw$EPE&)BGc`bO6N-%!$pjIQ
zv3ncxj7TAev0;`FT+N*~f5+c*7>QF*^ftr8^#I&!N{9@tWA2V~clAX@NgZ}zGVG4S
zU1?Ner1~rFEC5vJ_$18lr{{=C14KDK6mH_d8>s0wlm;mf!Lt%ic`hwv^1%Y;b6b2`
zHb$n7zC|j;O3cE&&Rd#FOwLuy9)#tsHFHbSI}U#mo<I1M3r!(}KH-bh9;7CVslj@x
z6XnGi+3bsCo|mC=*%~t2--Hb9_~`U>O+$n`Z7FsenV%eXIQ`2hxxB#GZkV$Wt@7Gn
zLl4aD#JGI*<2hiNcGMfQfR~l!&s>*D<WHPF_IM7<>FFt7SeXr$r(!YAQ)>Hv&C&<x
ztt3M~RYNr$+65U#C(~+ZYMn=t?j5s~b3_B}cI$-ql@dNGN<4|jvB>tOcc*t(kxI5C
zJ6*94C-M1kE<@SRrwO08v(UG3e&uUoIaeanU^}b$?O}6KL>$SkZ4|FICF)n`6(paG
zUb`OFwSQ<zQ(I*Oe+D9FU%Yk5y+-s(`Mc&O-Fszf8}D$kI?JOwfNf1u?K>vgS?3z!
z0W+lxKlHe-I}$D7w{Lm=yP_*&?)Cv_T}v<je>sfRg_KYT+La|2#vDAsDh~e27xBRZ
zjr2?H{(x~{`d5ud)Sp}IcvLy%*5{k=mxTP{OA%BTap9>Ya*@-)Oul@)RkF$mc`cbw
zD&ygled<IpR4r)A`>~hplq!~2Lmo{dVvgcfHnbG}YV}NfgNu{VPFM(6Y|k@L_kvgX
z4`xR7qdnE{uYKqN_5PS>Mx-YP$Cv+-CGTw_&fE$=ge1M1tH!3K=^jKMJ{s-WQz!QN
zN(^9&xD?{5BN^6z&>2H9N<<71MgG9l@NrIwaF`hho$3y{sw}_7Pi<UPn~+T-yKd;9
z9Z$SWs-V_6!0G88tNS5Yesn~=zxXBGz<3~E=-0j4ZnoY!SY#+gBK^r+xq|9k#*XRK
z$cxZgz3%pfZ5v?GW+AnBjJG)E1>lR4e*x&a=|OXC4?<$Uq#82{l_CqTzd+?e%3<P>
z;&XZ}sVbN0K*lVcZPeDClr^=Zoo|@j{Y67~BfA-vf(-f=?BDG;-%>8~yL20&sOe~9
zEEc!jGhdVuKpREV$}wW-&R-k!Y|4K0jI6fDNT^UT|35DYw}4sqj}09=`}#@-cXBuM
z+BhvLeq;age)LW5IEBXic<+W3;kR$RWJ}L{Cwvi2{S9wW?J%rMp^xM?%_7cLLKN3h
zz7pR-C-fQ3`_H4sx8I^+A)>-#7hpm}B*E!Ll8Hxywj{GfbQp%GS$?EJiAna25AS8&
zbc?-nj*GqUxsU<kss@)EkoFF;@Qn4Ixj?xP<%X_rGx#XRI#V9bP)@GS?i(lTrRXRD
z<bH<;QZ+karjD<*5MOWyvsNcv6+9${x<|n00`f=bv75xGgfsUT#I#%XqH;M?)^FpY
z@*G<q!tv<MLNFhx^yLor(--%M!=KUsi1Ecq3cmXVquZ*Vq7>GzoBcDu4s1!;6l3(i
zdLZ4xxJ_`m)ceY_<JX0#r4qGPzkVC2tZ))NV!yUpM;4IY3T0Y4?o0;zh~lcXW=nC!
zAE`CvJvW1kHt%uHJ`}!Pp0kt?lIa$frRCB}M3O~fHOj(TDKGZFz8IGF18RjU5F4a;
zJwAzbCAClF6Jbphf)!3VThH<z9<ev3c}{UUX0kp|ht9ZPmDqMQ&vFhMwGh9%24YP%
z$$itnaKifFx$;V~WkL+gNQpj=LsnB~AOJorX8$9^sphBKf?e*?-e+ZQ;~}-IR~HD+
zYO{}j8luS@H7Mk!nS37+>sTeJLSNl9sD`ODbNyI~{tW(`#19`+?|x%Vpj4vGlJ_ox
z!c+P@Az9MRX<p-IA1nKraD3t&K)$2@q4a>qYx^BOA)Xj5q6|5S^T{!8X5+$|)qmr#
zYJA3=93K^35^JAmNw1ZF$pHgSEo4{H3Zr2Q%EYQSd60$2e4z(k$eXw-xgomggwKzg
zWtk-1aejaOGw}J`=O}0WoRR)|Gi@Z>o4?Y~XKmU8J6KVGHArXZhA7gk{(CNC_1u>E
zT|jDmBVH$J{u|LBcPZr+gkOaoN3le$ooXaU{|cG^w0xICKW6*cK}I0&HdU{L&ATes
zWfWD@CmT(-QxHJ?2CFQ%RX<PaSmR%dy8UaHaX*|6Lrb$1YU_wyd_KQLp@qO9(shfm
z3!GnioNs@FH;pe>CkIcH3pQI4<+<~EMBiIyA}BL(m0*1EgxHuL1#V^+Yltt<Jv=Gz
zH`a1_Y-<ZwS%$5+=zG4BkKQGU4XQ!VKxiOaV*dN;o%f@SAF=ghmC+2%X_SQ4<ry#9
z|3tD`o`@YAakz6#;bcW@<}ixuCYMol%K!%pZgNEc|FvV6DNs$P2ZP9iS}qbHZJ7Z=
zfMcKrY_PR4FWL1;=WJm+>X^#yB!y|ephDJ~6urm-?Q`k;l?%qFso)%!e4^*U*XgRZ
z4G3PFHmcD|Ei?1uC{NMXk{sTiax5i-)E&c<xPeC#6HS>0^@p9&HmY|7ZN99BfhhTZ
zC(dM>EZ+Kv)>s`8xyELi^bTsmzwP<n+CYcKM;c|4S-faj)S^ac%u1c7!xQ`oDbNf6
zMHZ}_Li}tL+NpwOFwNoNpzE_wu!VvH!&kqbXa>ps9#D5f1a!?w?f7vpK<J@!MD!=d
zpu+m|1rMJc07RpWS|lqggtSJyNK|+UJ3ujX@v0<oc&B*nO#UIpIFDlM?x?{*2p1Fd
zk&AF1ghK;#*FFiK;6QZ*%&}B{m?5*Ws&p8$`g-rO;+zn~mn2Vp^y@4THJ3kKIv`5D
z*W+`U3u9uas-{YsUxjq@PV$*uGJ6JEVH34JcNv>E^<Fb+TytL4d*{VLdC$k7u*Y?F
zD{bm<4D$7zzT+7(OMu$ST}r0)MNuDph6)v@SNl<3j*kXLM(M~sfxbmytT7_B`1_PH
zLwJk9VRsO6)W^ZY*~(kkxI;34;uVW?wveKOsp~#%Kt03uw5NiI^xh$t>!dKcr0DlK
z0f1z|%n1$+KhUH*fvX^fIB=jist!V%VANu+acFF(%L&WUzr*@Zb@C&d686WfUgWG@
zTrkdD=%@YN0W-7EFF_*jrA&WAK^JL<K|{DdWU0Q`$V6F1S8VQg{Utf*DUZArO<+17
zn{6cP?b?>W0T?9u=Lumrv$o}Rqs86F;kOOJOS-cEsO=cL*Dy+2^Cs7yeV|kQNkN#C
zOR2I*p*Qc7ox4XnN)n<PUEc9e;@wf9vd+)JRsa3h9=c}{p!s?|FUP^pSS1tM%S2wb
zGM-#9_}98&32z!c^|*L@8#kB}`)ND(;Br5(A&8o;!QZ!{vb={*Z~cVS(m!-Y=RiA%
z^d^ryvL}05Kh0!AT)?H>KUcG5X_0q85qTx2j*E-kF!(xFjiecuI9o0MAQOKv_05)X
zK(Dhi0zxa<>J{17@o7UzDWWT2GI@qpLt8#V32C1H80h&Z;;+9!7dQ%EcENJ`14j$!
z5CeQEQ6^CJoaWV{{1LfqS-;U<hT~&#*D18q`=1#s6CVe2VdT0uyY|(awaA%{0cJZY
ziC^g%?^#jm6bJ4|F5ZLsKlR+RseQ&ratb~x26?ofj(dk);5A3NfWs?^_8p4%B4hss
zy-5wow-zM5$;CS5!N(<e`Y$5cU|bm&oVx)iljbk{Pn$7|%%Hx3B_3`0zZE>Q)1Z&=
z;Yi=(HAZpEcO5h*M80UidbEb~mwykR`FF+d@M$9#b5pyhyG;xFnZ|SQN%T=nm+n|u
z9@BlMOVj3Gto6fi_ZaF7h;S!c^!2*Y+f)ZLR8%|~D_MDx2NEp`T9tlYx=4=lF6cm7
z9r!sb?_vTVpJ?B(JSJX?&B52$LDqgyLc{2li-*PWk1mCFfg*=4{<i^HF2f)@DV*c3
z*PN>%?>tp!|10S&7V5bR`FYEwo^AcE_Gq<iN=(Ho?^j=ehqc=1+-D!x?hr(|KlgDP
zOWmDqObGjeS3)divjSV9BJrfFB(XK{%t<TD*WXbk8?I#cZ&;MmXE<$KlpKle<7l}1
zn`Jq(YOH+Mas{ImnpSgj{vtp57ynCK!FD+jjUY=~WFWDPOwfer{eT%d-?Vcl!1GN5
zEJL{Hh3vh0eA4<o1(uQ=E7VQ#d<%Vk%a|f~|J3>$30?hpC}fo-n@@`&46ftnxaa1^
zoSww_Vz9b~b)6?c(TPb7tg>TxZm$RDp-s87|NH=o5cq6gD_Sn>y&KzHrm(^dUqd5g
zl989N*F7BybS@$?*IN&(nsL#OEjeM|t$UKwu4@!LSASw4EcDch(W;J~7>_xsCX)R!
zG-+AP<<<6IU2XE8RvibthJMP;7@rn=7r}~0RnjQ@b43PYB`m_%RGd*+&Asn@@&7%1
zO}lfZ-K|FQpRDKLORhYRXYc2%`_j0n+WkIei!MpM249KhoOHzAZSE`ayjGwt&+u{Q
zUMA5>;w^3$AZb?O+It$uz)h(P)rGS6J-{UBu}!WPLO%|@i;2iOepgYxi(%e2PLjrV
z_wQ^c@pSMsxOeG@T3!3S_md1gPxCeWW@XqR!{XzKa7;wy>CL-;eyo6#?!l<xB~Z-S
zPQ7N@V3Dn-z7WBoy%NM=MCc&)8GQR;2)jYF+xzLO;G(=NL2S|F?LgvvJD+^~!vJo{
za^gBCR?K0-cZ&X^W1#1utt43C>o7<4j%Mw^8@NhbZyLB>Uc?PEGUQYiLf)`;L3s96
zTb>(df;WAf*}4?;IWpT;MoYHaqTX~rco@AvmPA(;;!tD|jtn<t{Lf7#V?fyK0WM_I
z_<lZ)T_@|^gUEsOQ~&yZ=yKUV7g#k<-3)(RLoUnzK^e6sIQL?8ZrcCIg!tN!V~LL}
zYX@LjZT=<_1X<|zn`~Qr6p*f3vTPdIwol(U;|-R5Rb0&)?=}wod0<-g-$1$UxH<?T
zWhzGhmBVRb$>@33oyX15^7UWp_Sr4v$JLcd&I-5sY&QtMqfNGo+g_0gZioRjj{~gg
z=NMP38NaB>`gjXxY@zKV);n?;1>HT(w?+1!<gptsIbzCw&!Sjxd}7Hxk~iv)Fne8a
zi&^iZbC4N?o@_zIYkiv0kD0{x(}9SrqU$5aS86+FWX3Pd4@KGdK6F#C38|tVp4jES
z$v=KOVf40ypPC5HB>kEt0Pq10|FTTqz2Cl}9Ws>FgCJ-wCX=JZu-ECMy4uxW%dpWG
zKQ}Vxw<mo<v<WVL0!tviapS0<kiWjn+OOUIXHxi6$Cbu-MV~7^Pe#vrVo2KUUEij&
z@M?V)>HyZ3o27KOll@<<%73|L{xz)Tbdj@GetKEH$MdCHA2*9K`3MKqx)OPuV)^iu
z^&22=;(E0*)uO-vZhY}gi1cUc39*oYGOaGLt&05JOy6fW<jrk;{>}cby65b)#Ca-<
zYU0@7S3F_|VuRUE5FUBT_c^zE7#Vfg7uCvd4qVAko+sLMN2}h+&z~f#n*`db3~W0c
zVcch_r&?aYQV8Cnrq3S|sZ7lkecu=gP`i!DEZ*c=uj-?5b!%PFQdj#)dO8@|UNz_T
zEaL#C1e$e$5ok>npJZDzKZ(?Mrl8`0eC-94sdKtpg#G^McHb)RCD9~@D}CttD|ZSG
z-~H%wwG~+IH-9yNS6LbCjtq{a!{;P-Kx_E>r;A$*HIk=oXo`&ZN(j2zHp_{fb0<UW
z|M4GwNTaQKwB%>+UKRl<{O&>0IF!e4OQKH>)pxoM4y{phWVOyEtINvq9~3glcVDp<
zn~dD>moH+PWKM2d<_mfGN&ZtLp!W&5&&?lFE6^f<9iei2?T>~aMoTa2TW1!sVr1_1
z8WJ($UUhv4t;7;Zk$+7Z^?lV&2*4OrwOk;3i^`JW{oZbpLQbD|tF;^-?y#X>qEIay
z>E{AvoCzVSHl`#+(K@5neVZZH_JyecV>`($a7abxHH<(}cZ$14vv?CjGsskIgv+&X
zT4E~C?`d<1<4G&;$eNjl)^aWCpcyb(kE@BCT6krdD$%%HCwC!hkhYh1aw9%3M9cHm
zH28`)#xPhoBE)9i5GDbK{72j~6NM?|#FJB<ILVnSJKcHCO0DwSxpTP1Jb49liK`i5
zK-eE+ON(Mt^{T9jmHoHV^=E1+{vB;?#yQfiJX=ay<tC;YT!Bg(DY7*J-wBWBe>JFU
zLipSVn75cJbFUQ{#mw^Y$ICk_AR(h#X(hr%;5`UB5y{EP$zC2or#~Rvy~%cLWpClK
zDY{+Rs=mF8oUhkp=D}@xR5J89C^nVsHL{{rmWZ{|&I7eT6Bm_@yQ(;{fc|?5&gH(H
zrJ_c~4A)o6M?h<CsQAgAKr%pf=VNZe0UeIN5=XV;8MEGvVB$lgcs~ES%3LcEPNZJ{
zIj<c6tC5qdO)NM)z=`q>$qR2}t$1uCna8EpIcCu)8%nA};WszU-a~Khu84L$36lWx
z@wVOdz*=>MWaJ7|S|{Vjt$3KZMXj+(rXGi}Gl^@k@hJA(WsI#75HMZrkWS&aCBGy0
zI|cl@A9(kSz#)@h6T-a8k3?nm%%A#1)PBkZ-ID-vodis#Oawz9kMt|o)VN<m=As7l
zaKsBoCM84*{Np^8hfD8Uv0}8V6(?nq?<VtEiFe*Tjz(|Ha>y?h4-f!j5DDV{FpI)=
zL|Q5FoVvKT+)Q!pYCYIP8>_4LXky2M%e3dM?Nuq2Qh)7ePTx9U{KLm8C%hME&bt(A
zTiXW9Z50(WwN3M%#EEw44(%H)p|Tj4?$_2Cf@OlykYD!?-f{e-_fKC?TS|S&VA0n|
z=ttu$%Sa#uZh<d%`}8IUZ;kNzG@nKY(Rn>&HhRAwsbuqNjQ9|iKXBGZ9#_dmqDcL5
zAK9$y<DBBjK48RFYSr+Uo=a|^Xj=9M>pI>@X5xeXJ9E1ofBRSFnw!-BgdTE0a`7&#
zHm*<9W(bs`Iyci8*e*u(Gf%Me|G7-Vww9q0R=xZ{w6;E)V@?-f5XC?yy4L-CqMKz2
zMii}7bN2khZg|Bv3GY~AV+ji<-#2qrW%(2zaRc4=YEbyJ73`|8ElIlAkGYFdJi9G)
z{+I&tQQBzL@(%EyLtuFAiLuhZLk`aH<bMNk)@^ccX8+?6m8HSVXfyx%g_>(ZxG2wL
zhy2~6J51;_@W&SUz-{~Mmps$#^)A7FWh&G9-M?)aQSXi-jSM7O?n%crz^J0K1tlj*
zoBf183YxhY+x18Lg}9}L^4g6SF<3jr<Z%mRcz<9pRXU>yVMcpUGLJ;xK-r%t7S@t6
zEV!XtvPs9J$)9~qwoP?bM&8aSyd#o=VUc<tD8?+KVS1$nw*Fb^nxm5(xN05{!22oy
zj8NzL@R(8hx+BKD-_C!+ujtqbXuj(=71P`5Cb5oqA4@gn&welOC2hWrn|DO$Oe0$C
zXNzt8lE_$g;(vH$-upen+LtRnR$gcJywCsZtn9f^grJ@M-309g+oNU~hqwQ&`EH)g
zcj_E%$HD9wk1(Mj3y46yyMLELwLuYgE<8zqN<5w+v_~&f4WlZ@R#&v~_Cue0NUk5r
z)V&e9$t#tqnykrNI_Th3X^Wh~!x{}2DxgF2^Olg-c|mA1oJUQGFUpceo}5+4NS;)|
z+7Vz(4wt#s`~F^T)x%m}0Ndeb-Vl#0je*gx?G$=zJ{wV=wKxit(002Z+vph_X0tec
zDT^fJaWLPsjUBDj5${KtibV`s|M7?PMa1GIen;P)p#n#)H;!!XN?6)WKb2Sk?}#Cb
zt5?!}5}g9Y`-;#t?-&`c-vN=$OYLGdDZ2T)tHK&T_VHe^O$DG0Jn_DS`Xycx43qCk
zMZMOhZMYDIvyZOK%{8T@Ou8YNY<1DIs`GwO#@m{BT{AL$j)+$N@f4Vp&J<hhb=A$>
zg5`#Sx-1RZ!?_ipfCHS;!Xy&2cU!#qfM2@A1qHAXVH4rHY9Fmmg^afnP=B7JN_DK_
zimXqs7VbF5OyFO$GbPj=FgFkMY(DCdLAKCE{4x~<8}fbt{YVAZ8MJgSF0A~<CGFKU
z((;|#a{jhz6i~Sb2r<;$WEaVA$*yuF^{c}{17mik4h~eT7-_l>>Gz@Pwky<M^eH1=
z!uf;Gq@NNcMA}nIXvNT4JUXJIr&E;XPQo}wobdH3&Y=gA1-x~`!e#{a3PU1?zAD`s
zCIKG>mZ^H$@1I|y;9B(VCzcq#sNteQW)xBE@JWY4YMHwG6Q;QJb6arOz|h&95h|g0
z4fh*tGdP;LAdCs|SIggmcwS>Xod8RiN?y*_#W{i}$Z5rbno=~!+fT#ZZ=Wh;4!cSU
z+lc)(;;9w)bZyIo>1=j$c%6p6g_k8iY@AlyYi>)*oWC@OA^9Is+a<0TNIS^hr)Jrq
zu#%%1e6jFe?Xz|C^It%@z@xmB`7F_2`P7>bEHy!Ri$}$<IdpdEZ8rOHtsD)2cT`ZD
zSFIP7Em#V$%LX+5`L)&NyZbEA7~>3$<&DaEJu^sSWg<`)O&Z=+{R_XXEzvcl=(Y;*
zy#W5@mH&sBQZhh2oavjjDYtlLNgu8n*_u1v0sbMS;_{{{#|P8KcN`lu@@XyZF^)14
z)G*=~@^=i%3_nYS&b&wE4&!7(VuJJu&nbJMHxwy@<=?+;B<lCoMtZMaKa+YiI%hj~
zta(2a-q}FDgLA&9fg{w($8grJGRrpeG05p|B#y(j-x&;R4i<B{Ozxr>`TS=fxN2dP
zZ#sQ;Y^$vBX%H^~41q8?wA1!{vdX9JYqb;!t(N|l7_zPwg;%uVCpwdK{Xi5rd}Xd?
zSlIn8GF^_k>r`YSd?Pv6_9OM(lFwe+0w{u!-K{6HRyb%1?_&#&mAkVtY|67;Fvp9+
zcR}?{LDq#jX9qSQHPz;b1G|xumr3vzq4ZcFcBTY7bKm_#E^J=(XJFyF-=&&1ziNK<
zFhKvY7HZ98)&~o3V;p^>{R#VdpyM_!nsuFx#Fu+3EizH66a>|HVv8%VkjkM-dP%4m
z>I_q3lS~&y-$jYnKfY95$C-Zk@3abs@0so*Q>JX(+-Id@5T%AGbT{=ifv91j=FwL-
z$E<Iztr0P=EE|U2Xob&5YTNZaOMr6d-QLE{G$Qr1-v(9gO`3j9pPtrl4tMSpMv)hF
z$QROm;T&N}J`DnYn<o3UZBwyLh29Z>&*k^F7I-31A_pM7DhOG-u`4W}x(TaEw)%V2
zL9UqbCRDlYW&ZMn6s+D)NcUI0geyY6ik7){5ZjJ^72NM@O=lAvH$s=os1!Apa)S<2
zpVBf~Cci%B{F~8H^fs(a`vrwGmO$Id=Pky?M+5WnARGtHZj(DfS0#g7!@`IqAg?On
z&0}O)kHx_p^=@mSRzNqyMa9)n@JE)GItQ(NOkdH7L{(0#cx_x}q49FJ0Tgud{ktyO
zi}hGZtKS*CwWt)r-Jv@(MajBznr93TY1iD8Z5TsO(hrx3Q|}~j9qh0A=-wU#Kc>=5
z4{qrl8i-bqAcF|xB{hD?nFG(pWf--CL{Z!fXu_>NK^mi`C%y?KTeng)7qvx7O%h{~
zjB!^{<=mR&+41o44Sm2Jkf_jXU+#G%m6#u(91j(oSyoQ`8co>KBxbF?2zDE^)O$vU
zZlO57@9qAqVJDB`F_UQA5f_5PJyO%LQ7Q+bd1Nee$63T&r;-*Qfb39kh@J(4I@)lU
zi9`!-DCC{*ZtX%=4YSBa3Jc!WnzQVE-c(*BX*_877V}opLoC7&C6Rg>Cz|VFx?Ues
zqu2cBW5C#4yC9>N_Z=*C^P4xc*7C&4eFhS&{Fd6sHEnk3p>}pVMKev6WCp84RHn9a
z8GenAz=}f_It{k0JGM*u6)LR1iut3+>L|Eo-%fqoqfMM?@lz+k%7-j&(Yp;_aL0^L
z`6(I*A{gJD887X*K{4RheH+csJh+57ncC;sW$^G~BGH&;msG)xjw^ig&)Q-{+f&bp
z@q&`-)?etcQ5Mc_b)bqK>~1%B&HnlRoUT-5SJ!pMvoFli`>7P~)^vfq_LJvrbsUy8
z5<NSGEw?aGJ7nx>`OMKKj@;H6*GRj^wzVV`Rcr9a6V*lb&1K&ZRGR893!h_@EpEdU
zTtaddo_W|DviccCi%KzuKjRr<-7}^ts*Fw<T9Pw}aZvl)(Gla>S;H>S6Usl5r|CMY
zH!0Ecb!rWnGgy)+sHiFJLeSN&JT4r6um~sv^_vm-tTYM+sKRQ>76K<pI(1Y=lKqeP
zwCXI8({$o#-J5jVi{KPVb}W}xi4N8+X&UPHq6EgC8ohCM6BklkKW=z7LdS@B=2UTH
zgv@<0#pig`oeCOR-2ZSy6;M6&sQPcL5>L^|i$>KO(9V9kxlX<Q4X81=b8>rUlPyEX
ztZuYWM!G77SV(gz1(l=43sOgrl<zDIs;^B707n+Fm`-Xwf2SuoI9?0xzzyLl2_d`+
zEs1aQ|I!ZHD>vr!fQ~gN2sfR6qszqq_$&k2o%vR)5vA9bH`cMLyS<StGTVqH$@6G^
zKK2&qId(y1x;sR-f~T(@ynFpmp>(U&XNY7vE-o=}tx4M@V7m+?yB6Zt`%W%XeBIL^
zLqnVF0+q@r+FX++#lZv*@cotRUzGG;YqYP40-U;pY;qai-+CkrElxS#3OZiITFH6|
zu;g>9AI{b!oGuF}{!tRZP&Wl9ka9#h|AGYGO3Rs#`mpGwiqB=;{Bk*HWa_nc2`ZEs
z<6(VxyK44@d4%Fi;P3+0X9KEs>#MS^?2M-AC_kN_+cF#HZ`s{!_K4b#{cVuLPRVor
z(?aZTzo59o_<_g(*a37)VP$t+pQ!(Vo*qey9sC;gd7NwmMfbG-8~4*6>K9+<|4BYX
z4u@dp8x2LnwHTgW&q37b&4ck%=ltbdX4g`hT1H1nDcXXHwt!bnhy`oxY--e=1N>x`
zKR7%h$PjaDGrC|`^-4Q=R|fcfX_9JP*j(9l=jA2CPa0Xw*)OjojJ~lYj@LEeH48W(
zkLQ-xA4m;%B3cCFxwbE?fKtDyQU{v~DbtMtB+c=f@zAYpR%G<_95?U`84l=!HXJIP
zy00R`8xI7WGe8}m_Tq8tsxewRMPJ)uuMT&{>NX@aCf3F_43jWA)+fv62qa03EHh}Q
zRjDz=<9@#>Yf7&adt*G~t`lEQnHngBaU<1w%bnq-i~zBotCGez5JsIewc1y8j-`nA
zER0hlQf20E@Lhd$QEInc6JpWx|9l{-+XKN#e_nf@6jXLK@Zym&xWzs?T&I%7p}n;&
zYw1o_ZTZfI&2caVddVu(!uTxG3erY7^A2;iZ&u|PqW<AWH}UHO#6JEE-@l8DSYvo9
ztu1%5+W`|x^B*Qp=XN2)SM?9dXB2O&Z`%>c_tdD53|D&f4hkr-artPk^psQtZM>Z}
zwCiu$^Zjjv3Eo<#2@!Mp4NOF7xC8Hh47+uk^`C{GC*SpctaycSn|pWihUzMmInuKY
zoAftU^!x*$+=Qx8Oe9tVHJY|G62Wzt%B1PtVt9#d;`^RA>mZ9USNXYJA_>f+CkwB%
zP%G*Bc;93$)_TJt%Nmqo^sA(!b1!h^RnYyvk2a^#u^Q~PD7kLQ*s8@!bBUL}=BqhB
z$in`dYu42Z3FY$>OJqFGPv{t9;rZGigK4GpX*xP9><Z-zOg7!F#|;jM3!WN!JLJ0z
zEJY*hCqFmix*bKB6Pws?w|s&8xSP76BiM-3_+fB#=0@DzIwYyzI%~U!Cfb%DLE2cR
zSNC%bIo*CyHYDl$$YPdERJq1V$sb05<?}XkNkVRky2DE#?7o>O<5!Zk-WyFtA5et5
z+vgKiPIZHl?3=&Z`3h9JJws6h#b=WOsUf}JtAVte$AbbjXg0P9#LieqUs2sW8ZRdy
z6Ee^&&y$0H``fU{QH8FiXa_@rjvwgryq9Nq&6As7lo~$Z_LJ49M`*i*LdK#4*z*(C
z&0YD<{86#KmFNoxWBg$FfVR8ItGIlNs#q^Kr##2)a#}o26;m7RbP)mboDo~KUsF)E
z{EVhIlfj^nBGz4ZcH0oO@J<bn@KNJ>zBgv=$Q|pX%bJTP;)yKJTu0d8ru#GtZ(mb3
z*jCOL_{5F#dREAlOf9O;r7F8$puRxZaMo=9n{_=~#J?98s}NVCkn)8M<97drOq}Wz
zr$eEAn#V+vFKTXOSTP8cmWt`U?}R9|ViXXx;%XFE4ISXuUcYNbUui)Kd>O>Pt|w5}
zWoj3WhYr;LEm0+1Tq5n`KjP3Vi%{nHrHm*=6Urn$Q2=>P(5-mURiT#oX&M+A0p(WE
z946$yPGHAKSUiwiKnq1d3{4}ZdUbaIgKSx0GFvX}<O#xWcThRiUN>BQRTvuIPhvd(
zFK~k<w0Q*oJK0R0e}Im~{J;uRA{ZILS2TYIx_GfmB;&&k0ex3@iBIAqpXfY%@Ov-o
zD?M~KXLoxkDc4CQ)!&_l<Z5`+#2A6?rmROlJ%!#N@#7D3&r(cbjoOjp(^ekKAMBdJ
z%8lo;rqUq-6W`%*UgIoGLaQMyt^3I1UwM~XMdr0T@0+y*L(0h=%!}q7B982$*08<y
z*w2=1jl(h<&ZgtvprF;qFUs+HCQF|0_sw3*p>1sHE>*RAy%Cfeox!n?y_ge`T(F}D
z<kDm642b>=8T#%K5nNcYW%@VEpZutK8jH_JnyZie+f{G*_CT%JliOr`JE}$25N2-m
zchjbal(04l-#?EyUodZk$+y`~HNB7EWQE_63kIEukZ#pbM~-JOsMw<~+z}S`V$d0J
z01`!lQceF4Gj+?mYCW#aMCGEZQNk_%8nX~EwOTssYYbtM&d#oGV*S24IICuIt8r$K
zcqJq2qDzuxRy3u`UcRf+q4HRg+rhVwrZa(>G+dAUAY*39a51Do;wif9e4*c9X?jwE
zGMXqik@9nWyL|84-K|ps0J6299G-YzDC~ogSky%<qIwYb)BW3w&OBzkd;4}n-U_#K
zTZc_nc=e|7#_#cfUXpL8?t-Imc9^X5;1+57Pna+?DMXsxlXY>QF)$YYu!7=qa!|^S
zjF_;Vi~b}NtIM2Y(jAMXRMBgpD0z!xJz3k{hsOtYU(LzBGi}3x1F9~}DvvUJtd_3$
z4w5!cUYzkIfBBuu6e*LMl_9`1>WQw~Rd^U)oXEURHJoeXP|wX|K=4~^PIai=iL-C7
zSLHi_Ah%ZN`{IHY!AO^DQX4*F6oUQNc>i=9xnM6fD5@ltRQJ)3J<(q|31l!n4iixL
zw5ICc1*KyjpeypH8uf%tLdT#3?GysWS978e$d|9WX!04Ta(tRrn`^s{1*S1Bju&?)
z${yz`xI^?e3o7jHqoXRNy+c%!Z+34L<5S5tX@?s2PSF0G#zXmmbLwCN0PcJFyljFc
zxwXm}d*;$?&?dUX)9}$2@P_Dqf8E12X0YU?CN?7Fp5)>#{{wIN9i~nM7HOH2-chC;
z9EUB`(TP9d5a8|D`V0x7|2G={?Eq?R)m=qE9vy9w{^Oqu*zYGD5<fHrhLu)a%6-1k
z1BDg;yw8Zs60>iVtz<n;Ii_>xQIGrRNv1zSL*2}VUZiwj;^%nx8e|;`>m~}!S|Y5%
z)gw(~=Cvv2d~gySZ*Y41ahz6E;(Bma)>JI5-Ey{gdiA=Dgqx=+lw&`j-G(jQca^7d
zKe8VJE|Ib)#T+^KU6a&|Iiw?ft_G=bfKbvMW+yX`Zq4)tC6_-7y-RBQ$O!*VLEQov
z-#lXOQ^>+EaqBgS<lzjfQx4jB(n&R*adr8o0+{(y%#iR)QDoy6RZJHC@Uec^{CnMK
z0qXdNK_FiudPwsKzeg2%H+Bc~iMxsCz7QWgg?}l=*3(V3+%%1BO)QUVRAs0UBK}pt
z&LR3qP7jgmu$g*ed6GCm56<qH!Cp61F3cMh&1EQU`sfM9C?44>8ETro59H((rBPoe
zs6&XoRI7nd{TENP<X9EyqPo{;tY({#v)Gs-<JDzR+e{L(4oQTGy9PKhTS%L*W%4ME
zlcj6|hYk5{-_XE4uM@Wb+cfrk&4oEv>IU2h>MuvI#;e7zckgsOEFhn*1mp`FT10^V
zaSaZ(U@pk42r;qVBCchc(T%yX3^GitECL3m?OEAQHjiWGpVCDI_hPpUo#Y~fi<uDn
zno=!xD;n8ASwI%w{PtVSCF(x5=V=U!1|j$Bg+n*7#f@P<GYr(-W+1i9l&3l-j~7$!
zm)3=~H2FEPt6lh2ZEFqJ(?c%4GUszkKl1zH8GA+sM-v_iUI!l8@fGdsFWCV$qDSf5
zC~_rT=?#8d|CQ#y)gNih4O~=t4B$q!ej#r|6J7qwP(TQk;)I?IU+R?QJ~_If{<@>9
z$!q^De=;C2X*=>S)e9&E@5VJba?X)kxp(-sD*@&0?n&{+K)bN<4!eoY8UF-JCF*32
zGy6rLusn@Fi7nIjhaXc%Z3;FVkXrtJx_&_fi(92DCwD(PM`Kk*m?3e+q=+q;HLM6K
z@U(_PfcQBPKcudf<BOTBjm`^H&?XW2!iQ25VtQ#{|33hcKyJTsfDa&;*1}~C<bL7Y
z4j5IDT)*X=R|n$zn{_Re<V4+j{j7nbsVv8p_CKsbCBE~vc{OF;R|GfN5?1$Q*5X1I
zDiHO1gr!oQY4RtElUJm18v*1yac|-kxqIqft<lT<x)XJvfOcmSg{30t%^7b~ID*y_
zeqSDUA|yZCkjCxvJ5l=`fDQqImO^-f@7N~}w6s{)SQAQV)yI-~L)RpOREQWkNJ_Kv
zi06o3BKC*?-fiU>aS~6g+1`37FXl@(9zJXo@O*6c=4bAVe81luh8Q-(V6HNJFn2iO
zNIK(mB0K<`Bfc_@RXW(hV!BGxdBk5OMVPq{{X2})&RU~J+qw%7J8ebpIU+kE=o34(
zGrZEgr$G!94o?wT#IwCSr6Ku|huN>+NYyX7k<yGoT+H5tM3I0<)gFVdN$qjvSCj#4
z=(v?({sGkZW-LSxHyx(<-4KFIb_U$zztY+_t@$fIKp8}JY3r0(68D6KZFA`Af4S1g
zHL}hn*m)$gS(IT4oBVRd?^O17?*h9Ycx8Q=n9!NN)w!57^Lz(3@(72kcA||lD7set
zI~{CcZSvwVVPTY7i61{e<EyLoL}1fQk%bi(Hj1RcODj{nU{oTO(<ES`0{lYxac|T{
z7<8D(rWPj9QNeAj14tEM?_n30SS0H|nM|aF$Q_N&c;O(o@7YfPLs}cbUhs1>O*uoS
z&v49(CGevv^rZq$g}Uy3t>6%*tF4<uU^^ps6Vtl^AjG(bb>ixrH?(xWM$}0WI4pxI
zOsY5#*yS)!aG9|N0KL8P<v`JJ;NrP82);Q0y%m9Hu=Rn1ChG69<VAjpc#QaC5`n{l
z{b0bO20_h#wWpUXO`$~|%q;cyh@Xi*KhOg_ZpKc1dh|!L1Fb>RPb_ang<#tNbF=A=
zV3aMP_}pgmx(1Pxf#?f0))Dey2e-aKz$ZR%E}Xsr^D5SI{<#;3-E6)1K^V?WkluqG
zCeF~ESwfmc?vcBWY+hvne13{oILJt@pPOJJCyXd@scl$K1~|XfFiKXtWfu-MwX>x+
zaz@!7Y-a!MQ`?K2q_&Z2Tea7VtyjAL84DA9d6w56cmyRb`P%#SP`njCcd$0vHxj2e
zR=QkYU$Uvp8j0rZJQy0^pztz7tpof2fV$+u8cSN1y3Q83bcB7Es70k*fw2wyEiFC<
zE7Y4`*keK$)CAk5;L=jbJ-n)jkjAh<Dk6A;`O5&XueiY+?^4mOlrjKHjl{EcnA(vx
ztuxHmh6-zfAdwzE*#F!jjC#lyMPQN74Rn7?AC+e+I$iBLfb=%(bFd^*vlVixC(p^-
zWNraq7x6jb!xFg5(MaUELm`+J5xoeZwu7*2y+_v(iHbu8B=JE&I-XF3sK+DTv!24H
z0Zc%#3*Xt}cZT&S(;&vqp{Ft4wIFl|3gqvJn4gJkQZaPUu#FAGoPJ}ROJIX<Jkji~
z<}W-&bP*2`@g9qDHK$R6v<ou75dt#bl!A{O)X5l6Ph7;vY;;2U5*&;xA9;|BiwDBk
z#r1RkrzZBCjT9+2fql~=xVEt7KhKBbgS`=4d(rXhX$(*SrpAQ#WH1fioH!jD3Wy9H
z{2P_lJ%_fmN!ud8H;U+c^4Nnl`~)?u$)<#E5a>1415uTC)cby+q2n_rZJ*5YnwFSs
z{tu^!uMr345Y%jHmH@bnOnX3yo005@iNE7(CmV~h_=tSTEOC5|_ytBbaNwoI>D!mD
z?4N+%$r3kd%*J&iXc$%0@tlHw9TK1&1Aczb<1FmIvj=MpH27|hBv)w}LG*TE=tceL
zvsZtCxog+x^NL%S!wSN%{B#@f1tfCY{M$*E&{OVuzz4Os{v1j&MbM>H2YX{lbIX>s
zF~c&p{OTN}`x;S`k6`~Z%b-o(4gfn6!z=k$6Y<IP3+ML=2=0Rl3MwOEz2eR<Ru;Xj
z+=Mvujheft<IYHt(fr{yi>2(OK|B5y@r`QcL|#56<9Jm8o`cIBVDn-Jd(Ofe(>pAd
zP-Utx0yQ{yg-v9+gJZ~&JD(xz%t4RFcB#HqMSNzxQUs~YzoyLHvx_y?*8$kbKxQNP
z9ua_8We{Q^I(V&SGluu58y>AwqcQ>R8UCP2_CEke9n4W}1ny-2KQH_1IskheRxGVz
zzyUBb2ib3H028>i-Ew$%fCjGwrrKgq>cHM`no4MTb5~bpRK0Erfo8YSu^ofG3roUs
z6?8WM#Ap*_#M}sMdjUn>dgJ*^&T?&B-h^*&G4>55a7DT4LMgxE76j$zGT4VTizpS`
zfP$l1wY-!xntWduaWi9qMvEj8mCY5Nj>ER0Bp%2va}LZ15>WxF;l(E9MgRz_!>FGv
zCzM{TxHu&~L>W$Hq?M4eQjn6a%Il^LX4dng+HY?1xamCt%xz$A<sYXp^}lM_5qA5`
zkm(2O!3~%nK^{#YsFDn*Cda7h8Jn;ebv9fm%@3R2U+bZTJcE=!Ak;62up5081?tO3
zmU;k3s3xkuoW{Se{8dbDff-H~UiR;`VFr)-kwU|nGC*VQHsd^A){TT})za<t)T53h
zuf4&DMGa@NX|y?5WzHi+KJKhLZ@UIlT6U_Qh#FGtIpRcd*HO){pAKgA19)Bi<l;|N
zE6&#5C-~u-Lmx|jrc|WgOn(CN9NDMLrrK8#k4WK<_RR%=kX~O%d1rX$%+?6jr{^cP
zISK>$R%V767}8;L6v)K09ieAB0Gw8#2gaocSVia_l(?gq(OZ4Gvn*|~grlA6w}|f%
zmxzamANa48l`6MO;ky5lGOC3C7qobl=_SiGhEm<<j2E8x^ErBG6iP~h<Rr`YbW^=h
z#t1U2eZ=n~UL*bucit(E5o52CNl?yiu+VnB075W5NK;={y=daxbn9uH%G^}}>)=1M
z`cJ%SFkeTH;D}Y0MFj_-qXMkR*e7Q&w+i0G`t-Q*t4$csY|_W=0ErvP41V|adVmtg
z6b5C|`yUCE1C-`7^BqSU3Nlh?zZuZT-#EgYsQF2o7p5?M7X<76y%(miPGlXXbpQAE
zPE##jx6GI1G}9Eg$5EV@CBK@|$1h;-ZV9Vy_H%~tDlq-lW=doRCR8q5d<q;`4~L%F
z=rg$o(P3lagE9ctfzpEz^kfcvI|B}j4`IU!bg<v>8^>gpv~QlbZ*<ntSVw%M{2wF~
zi=U`j?UsByOa(UDOBx>-kW?o3AHb51d3+yPMOB5g92*(1KydI^CC3@*{?{&Ell2#9
zIq|NH$A~|e_)fj7Co-^&BB~N}QTI-bT~YUJcX4&bKjoyUJ1A@sCn^an&ffsNDe3w%
zsOkuAs%;Fzf<;yhtv2CC1^QA=UT1l+a%nC$N9ylb>>W_xxE!;Z(%9O%J9|W#_=Mjm
zDby_-@I=%TbCYAbM<7d&{zGJX`kAZ{f1=o#a}p#(dXD%jUKYJg<4UfjYTTJ4U2&zf
z4y22p*ejvAk;ln-*-t9L&=M-)E{zNM$f;`34@81MMZjN{x1+jE1E2k3rL<H=c_nn;
zbHOVd<N-HW8-KNNfR!m_R<9DP8YW&!Gnff?HIN|GmOJ^mQE%+Reg@v7^3h5ZDr2Y~
z@|s}B<o*jHj$?=mHK;pen2J`5W20W{D<f!f(`@x@XSMl47Bv|j2}7gxFwa7y2iEMk
zp2nK6s1q<T!<HM4R?Xt)?!dj!ce0tV57yh9v!dzYpO)SCVG<aZ!DhcPMap@SuwJ%~
z(9;{9lJP7ua4&AjLJJGS4XXBP4ZpKxtnI9%saBwuh@UN+OO0o)6n{b{av}p!1zXI-
zMgW*_7-~E@<6&xWM{79i(M>?k=Ze&7XIJYnjn=~J?f%mMAbZI8&-d27Drr#VKjTPe
zrd}nZ5`0iPqFA+XKqV5t@AMPi6TvDrCT#V$&+v$67IvR(3i60XXF~;)nZ=YWePbu7
z!WepU01E`7dga8e<~C5G$334mi_r6Dpo9K_lFGrFDCVt_E@$CDT1K4=mJv@&F0Qcm
zTBjLM8F3f!iHJXQe!oD6$dalhCQ29oimxhh+7tXiXH|i6{XT=q&bkCIEg(=BptWF<
z66gmA@H@_s;G;}!XPdRHLTQ%%QPl$pG&(y*i=FF?=PM$9z$N8&5C)^L7B!?EXj<87
zk5>9P;G28f(Zy%p!UyvVSEFWgc&WW%gURjZM4B^GD0|CkJCeAH{31<%IyBfL=eMx{
zoLQGi{~PubaF(t8`QDm|YZiNP3!<<N+^|TN!Va~XQ10Nh7CS%CK&)EN&R(%Io6{%L
z0H>ByTd#kQ)aiiiNK+;z@aVz~3scYZ?kCC(B~R2EuzX1RfRvp)8)6Ex-yP|vr?Oi$
z0Ga5WeOubH%)t3J2d3*vN}gLav@uEE-Fg(n9wAO)S=eAHOqQUo^=H1NgBgt&`;7Lu
zeZ(Ik{<zLy1}nU^X6_ZpYUCtsc#j-_akNt2fbQy%G1sr_Nl^ajX;Q1vAYP67Rek!j
z4nSder?aVWJKp2lat6e%EhwylLzChC1IW=>&>0LOZ5&{E`J5u;nZ^6Yss6B>3is=_
zsrt~EU`7;_1lcXNxY3C9DdI;!V9!u*VaE<<lD!8Y6t^2=0k3$!9ogX1dn%R=^fKH3
zizUGY1^u4nY9Zy%tU9gS?P3)Vxh?_yjWSGd?1JkYvEum;1wT{cnK<4v`+nv)<m_un
z;eTZFQ}i=G>Qls9#E+Dr25w4T@oG3_@yRUo<v4<o?(2aqROw+uhCAQz=ZHrW)##{J
zreMj|HOTqF{zQG^qrC+D3^2K)g4=C6B9S-4l1LqF7L==i(of5~Uw*J|lA5)cjM}@b
zO)w_|`E2E`dE0jwyvWyhHv^FM&Aa!-lG3n=9^-l*yO#B8_SEz99ufwlchfAaGC9S>
zTQVOb4oC+T72f@LLCYUh8jZ$!^KKt$00;9&s!XacD2xehk_g^_>jr3}!DN_6grQIG
zO|uwG@F4lPR7eH);<b&%%FO`v%+Z;U!!(wAHO#fytz~fHT_=v-PZ6=|P2LuuDupjV
zSL&#duODdV+Mtj|9MB#+lkDhZ>vCuMhjjf)MxA_(2m_%S4by;cHzXjz`SLd04qcS`
z9|vl^uSp%QA~Ju6s(HZ>1-V+l^)C^>Bs6qksm*K%Pxk$h2Cq8~)wUA(d&Dz_P@1Su
z9&a!hbz46LprgHewg!?-tWJ)Z@7Ay6P*@W55cybeBHjq#+d@1kMr+G7atq8O%7QQ}
zxE-FB1o2=7lbbxVWrME<Rd1VrC0SQ1MMlud44$4Np4SwvNRS+qA^xZ3DSB(@NHd6m
z+Ve*MjDF~X!D<PAK@xrga8eNX#!SYnOt!`;_j3E|3NwQLs!YzssqYcrDU_zCr)6=%
zbXO`F@60#8Y%ZXP4V7uXNBqKGw@X;OfuLQyl4BAQEL<%Y8WIpQL3=y_yso&b#uV6d
zZ?1yjq7yW*c(np34Amw_m=VC^rV2?_zc8ssWz+Wtj_!s5iS3XMJGWJs%tI>-<m<&d
zSQ}zvdiN{vWg?8`>*%dww{V<9S;3(_Am&XwnRRc4wF}Rd{2K9R1UA1nons1JItV}g
zhxv!TLyUgr0>j#VCj9QiQg_7Ak{hhhf&Yo23yUIj7V(LT=ph3HctBwC#2}lrh%K#k
z;An;ST6;IgyZp&Z#2=Z5FD=n%52Y0GlCX%GwSeuJ;uJ=gU(nTvl&#kGpL6Ag47j&C
z`uulx+|+gk?57~-7Dj&b8u2p?qTfA$DlZby$qg%wcz<D&Gynz%`x7kmQ^XJGk4g^D
zH}<dJyzPSJQ6sASf&}2g6JpeV{gj}@JBR~O|0gq$tLZb+4ESix8XcP#R?-@|(&EW)
z$7p~jIB)<OOCD=Ba6`naRqo7DkC^nyI@4ci^8KD5C_R&gNL=8<3KmWHdOL+z9jn5P
zykZp>OuAri4(kl0M+dz#G24i!X4W;_N8D2t7(IZfjvb~M+L<mXV*@tQhu{G1c!gGO
z1m=MhQf4G>eeq>g-7;%xxfpdQEyNR(S#O^Kq<X{?4>qi!1bZsEc$WpcqtbS^#*4v5
ziB1q<U2*_aU&ujSchG54A`b6$P$&T1Gx~hs^!zzui1-mAA~O$>ex#*&qaEwf_`lig
zA0nO-93}DGT%hX>@#MB`t|H6aflkDv3e-13WLyl5UVt$hb3nq9*ehqYGHqyBHN4y0
zyux;<BWQlapT8O=@7J@0dJ5ln+-jJRW;9yiy7@-Duv=BKIS@TX$4(%`)P7XDdov6*
zJ3uj-wLzLRgDiN5=*1F*5|=)s?mzLPcbn%|=z6k;pFb`R88C_kg{H%Q2XP=Iu43kd
zJGfi_L>=*g-O|M+rk55=iOOyBO5a0O9WLCGDAg0iojqh+<_7G?umrv+ktrc#=g#`K
zO&cCf#4PH+G!Cxe=Qd<;eg?uycJoa>^LoOJ-r@y_6$f5fEBjz?NGz<Cd_anu+VDWi
zGvoae-zjVZkLCu_PY%ZF*@c4eEhafE4@f^9L3`F&oe=%)NPMo`<I8%Um};ySELP{M
zwIX^>inYyw_C%u5L{YlzZpn?N6qf1}JK#nJbogk%b0os1DjZ3zJ~9QkA*$5`laB4&
zSpSYHQCJ=z+Mr|?3wwWGuEmoINn*?cuvFAYWyn193@o#}%+&}Zaj#&~b&eHmq`wW7
z;(6QeA|7!%!e(mm*%qOT#y&%}sO>TXR@O{&xOxCkaBxQxxZWPqtd_uXas9Jp;2G!a
zN=TV^`yQt4e(i(jFMP#|bsU73h_Bq#BjGir@x4~?iEO5_+`C{c@9kYLk^F>2r~NYV
zpPNWNM|{N67AB~R*ER%K8B|y>A4t?w3OBRkG1v`fY_*ceRKm4^=4Fxv!v%Uu;P*?%
zdi&eCb$gW@^xEomR~9|PJr~IZbNhe)#hKOqXhzz<=L2($w&ESj6Y)+>^cU*o?b-qM
z1n3qK{+<Y0Fj-*S6wxbQ?|Eg()np55&OTUWd*cjGBq2@2XIz>R)@&JJ7dzJ*XVpMh
zeKj4odgA9}u+VW&hWbucsV~W$IvP;5`5VuFW@x^cuaTPKH`tkqWHrqrrNa1TQ$vM%
z=BK;I9G+pM7T?b-IO^F-&U4N29un-NB@hb}HpJ9Nab)~?y@^3C5x+$I2<obBh)8nC
zHX6}#e8rU?9%0^2Py-b%X(I`je(8c|JcCpN;3`Sr1`V(DxupoaXO6~+CYlj6no3wQ
zD2$RvI9c=4gULE3vYZVJG?>t_CS_0=vE{OR>?-}f<bfxf+A4PBWQ~Mbq9U4D?I$XJ
zHAHBR{na#u)$7IcM>wI|ts;JBUGWv{P{zQ;NS07VmonlHmiz31;R1v8IsnRkdz&CJ
zm<+Z<kWpBb_*Y}Ro#FYb-Y?!6CKeQE*Mi=1IVMp1F_6W!R;BsOoa)5`1n2u=K4b?a
z-&iPn@wY)FW_V}*Od&}%B^80cW9PqKyf<lo4)BzarH%M;`Ldx6>y>`#O8F~bzMR->
z-;;q#3HgQ~e}Ih7t*>ZX1301BRj+7<B54*MNu4rTx8TA6N7ARG2>?g_;hDs(d9R4w
zLy?xoRjX<D)uz&^zP?z$3*)wih-1WOo5&&!@UZOVT6}VwVs*BeDJ2hofrfwC9LL=d
zdIp|+;38K5*$x@yhW@I;Q5E>G+0O1U$_yCs69;Uue?9W9qv;8E5x=tmNu^~l8w!es
ze~{?0w8q#YKhbUm00r6!a}sAZd+(D8QPLQo8dQq{LaI)6Lk#ee!+RObz1{W*f*vfb
z&xr)-=8yRIg9$CQe+O2y#IY_~+Lc|mz@-+aIbjijw5j`z05B0E=h9P&FMZ_xRTW5G
zh+~}*SJM*om7J*#o_Yb1msb$;-pHSvVAz`w>etUL00;!g7$)Yoh<x~tZ#i!|;IOoL
zw5!b6^EIM}z^anFZy{FRh(Y(vvmW6r3QS_Ri2$TfDfvaE68AO{#l2EW12)F|{D5%w
z7t?`zH^U9YDSvGdx>ZIvoq)Xg@;0aS{SW-`$$H0CKF)FA-I_)f_80J-L(dBhpgYL3
zVN*t`BIwQv&l%a~?EQY^b`F~e^bqj_eFU$drhqgJ4&B7i)&+<k*{27SgN&rzA0mQ`
zzGhIv;sEppvvQLX$ouqoh1d6rc$F>qWHv7jc`gimvgZB;0J@L($h+8b5Yq+uMBofX
z%`D0&1F21pGc0fkJ0yv{35&OB{39|*L2g}H)y!=ZR0b#&P-}v}ZaE6!w7&kd(o793
zN=aXo>Z7mgPTUN$lTI#7vT=c=Z?N2rUb<pAIAnyl2;;2#Wnfxd-)m)~{jM1L9RT#*
z3bt6>5)Jlb^Lv!UY=2+Onb<;+9jemL(eoRs>U0SQ98P1hqnul)$<G%2dzm~y1y}mY
z9_-CXGYZ9#T)o3AB;>hrXf+h2qR=&%;PTgBQvNbu{|u#B@(jJb*zNAU6!N-!6{-3+
zW~XtD_)7S^Pev(7tC>L|<;!@jMc#AB7e%}))?Zi#F>vz@k*w!GZM5>M7v}*>-W#Q?
zBtR>*Sbt^MBe`TpV9Gdd1sv>U9Y7|_@SSnYNyODm9jit8bzvHcSb@kzv_oN2W;eX2
zfYTj0bB@zi+`MwOR9qkY`!3;8r@|gs+S?NDHVI_^q0GFeNQpV?-BF_6Py#9gsjPc;
z!kOp%+<=U7LBXj2b|7@O{Ozj^PRS$w6!FXY^vgMCtbEJc)xy}N5lZ@J55P)4o_H;h
z%S|O`J8;sAwH9e$UdNsS;AjeniZd%c?7K?hE0e#x8)IZU)mpc)c_kYjyist#qwLKu
ziPjjC#ZwP{W>d^}+*HW|iw8+>zFQvE<8lIVZp^^8ezCGu2>$gA9KYj4b|8^H`H7r$
zzB3Gx;WK3gZSqq4TL8%6?UkWi`PZ2VZ79g731D%CC7G|5J@q|aNeu<<dkA;}IA0KE
z?g*>u(dyodpat)hcJ{ci?68L>8f;Y-tEUEq!UjU@jeV0AVc~-_%8W<wvgBqpbif%I
zX*xu}?_#vEMLW#x3rS(d%C&1Htb`<^`LI3vGCSaTX`cuNMY8y3pk-yu*F7VYYH&om
zi2(gZl=h4E+fm0=sbbLw;winfdRKNUB2K*Pr8LL7Sh<p*8IU{&rXzP~p5|-pHZvIA
zg>;7nQE87A=Qb#{Uawz^erU2+jwtj8@T8_VyvRVM4?ZsYr$#uh-qf$}X29y>C}>8V
z*XhhIRNO_{4Bx}tw4~{KkW2xAteUo5)#Bn_bn+eXK9KncJ+5CS62?3=ocWBGtmcNG
zw-J19M_tMO1yr^tn?Kq8E@}W5s$UNgfy`cwBUe!2_?`r7E<OAb@kSE61BT_e_h~b3
z$?OZDON-km%}8=qUfSHn^Q*3{-XCWQ)W6!VGnkE1+?Bmj9?)@g8or(O;?&E)lS9Ux
zNg32%KU#tRb{ieyz|Ta9SN?{U;Yxz|=go=Gg;ke33&rxA`q%6F0LW>nar=tt3BgSy
zh|kIOXa3_e$<Dzdb%!G51s^u9yNJS0%-&j$Pn5!Ps<tKZv`-LB%YyZ|fziUiY}ogl
z^LIJOo>zii`Fr9J#Y&&wd#k5DE$@Gtcn}pH)tI_Cz!OQ~W!W|})0GV|xHdaaOSwWX
zneoXxgT)Of^_slU+CM!-)WBdGu-Dw7S-;l^r~-~;Z!N*Ru%AAg5_50>A(gXo{@hvv
zB@&$LlYg}t01C3^BcWe$qu`nq4)}+p6{SGKc_g8&$=6iV4a865+JTPjc4HJ)ZA6})
zMWw2g!=4Cz6=pjUx;>T4EwfmBWI}smt6LT&gM{cg7pV$r!fE$}_!`%}gY#CdnPRxf
zf<r^}?(H>ewb}f};Th?7sTc%*A*avr%N?iSU~1D}FMf^q67iKz;NIBk3l6F$`Y#Ho
zWpWz>{_#2D&k=dV57w1dA<-Wb95(Ug&eV$?<=X@6(OF)5OJPI+jyQnM3Lo><KSz9z
z$RoZ(<ZKH$uACTE{lr9Pc(133Z%HGGYXb_?yX(L9+}a#~zOJ`%wCPQ%abECEcR=j&
zy?1(McDDm*MQ#bzK!hFF6?BO=Mw9s}r^sy6hu^?nM@zf!@jHPr-N7FW?}4CTsZNMT
z9!%aCprLXt#UG6G7_AgHt6D#y=#nQ|fZ&O<rmES!y%XeW)uS%r&k?`8_gd->1TXzb
z#dUTp>Y2%uxnF;z>Mq*yg?g0AMVXPz=Y++Z|HiCge6UzoMtlJJG{&y>n*q3_dRB09
zN&sPPdfs-eWyuMeAB_>|0U9IQel&%5Y6AO+-;n`#9KOP`_H5bTk{Z8YvbPLek%EX+
z>kYjvDA-qg_*}34IpPi0qQJWLpy36Pl%X}=<9e@47BoZQ-54Z{Wc`j2e-ZHs#1K%0
zLC`N@L^nu)!3MwVu#kd$FOPUbx%jvl04}tFH>_m~?It`}K#d)J^B0}KVoQQ=<cOAz
zCasC&875wc@@x@sG7uymT$q}8K>;)*hN_*-7WaUbWpANuzp2CRp=~7S?=9-88g$|A
z7Ku=1@v^s4c#b{2GE7SiP74;K!#N#rT{Y6V3^etOSKO~T=*grK8921E!dY-HgS{tU
z5x{b@uP^~_hq;^y33)P)T$#cLR$F9#7x9hFD=El+-b~@cnCJ@EzXwyDj1qk_;l;uM
zh?vSTSCv~vrlRGn%Ys91=~6F%E4lGExzRpD#J}|(1yn8d$@04*U^9ce3kTdUOOE=9
z+`y_5pA8ms-k!gTc#Zf6VC$rtEy*4Fh^L6Jn;G!5g%{;d8#FFEUbFQkA1*tuQL&Bu
ze}*3P&6?60*tLZT)VwL>nV~?cwX~?aYEy}9XYd+7abkAsj^|PV>6DChoj{g3t2J|u
zmC+_5ug-QfZs$i5qzRI;A_kRQAYaaVPgPhu-lyAGxXXx#iXj~t^tk^1Cj>)ph!q~t
zjFqg8^3z2PRds_5pxsk|%5Z94p1L{j(hpz;^Kqb{&{1y~81&fKP>K`*OFdN)zEn^a
zIKGf6Z}b6#L9-HbEm3pt@w*#sygHDDDQ61);lu^&h#w=eh`&l+D?8x>gPB8>E$JoV
z8)cz+OFrAY-(UZp(wqx+v$Hp#s)mr+yPFpumFX(}zvd*j=GD&Y=#!DZPuThnmvX{l
z8ymE6UcZy0ph^=axR`O&nPGP7^w|b+BQ$tt9B7Y4ym2GfP2<T4PWpMZo(Ils07SLx
z19Xk}9`ViI#;jhP9e;IXVAkEb5lVN_SOMcgB>sR!);3u&k}b&Cq}hSp;VTPk;Zm+p
zB}=HV;#$3t<Ej3d`n@*t$uLN60@#UneC2HFM7j1Y5n)`Y19z~!UQbeQ`IWUuIcPCF
z6?$x?G7SJO-^?A{1xVl65q%1>2{k`;2@5F-ZO5@KB7TVYz~T9Z^m&BcOdAR3^^~@;
zSypcj&?47O@=ZUXEzR)HsvBI;o{ThBFOGmxqrx<}86Krag)|_cS}b91L!uAtr8*5n
z0S<&d=Qoz4AZBRr$V&DshEgcmW7-HZTjpFrd4@^f6H=ZKX5E=kctZ#hK(0L?NP57E
z<n77&au%y!Vz-%bSsAgjBM4@87;Xqs8cadk+(?ZQa9l$tEK+D8s4y*oc#bRZZKjdG
zMf@@1<vksE&mQV&zo0+)0SkFZINXc{&rN(2c)#FlmASDz@m?j>(ZJEGGA5VYqxC;;
zBG}%Jm86SLDBJHbq<ai$Ox92BBPF3v*Pjg8cUI?EAV3|97A0<NI%>6~w(AOb@ZY4d
zE^uR+5oAGeZc0>0?tu1aO$TC_%&y{?HERl{qucTK5%(BU;KP+uk4R<f%@{BchF_tB
zr2kRM)MGF8`r*%(zT_l<mCZ3#aXjs$m0GDpytOmUPGj|)!lK3E{EEmMEKQb6EKCu!
zAp$9F0&tkMsDi7kH(&FQ8B6d+1$<|s>k&vKgqt=&1pq*Kd}j+LI_;kAEU~oo2z4e~
z3OrZP!qrgOZ<oC4uYvLt7_cK^XGVf_fX*gS!Ij#;=ZHT>{GRjGu=^@v2}kbMK3Hd<
zX1ju|pZM~G(f0`ml<8*Ha$lKds09t}BYr|sn0^G4GGe@9X9e~7#*C9x11Y&>nAAk%
z(Fa}>Tf;N=ZUYlh?k(a2hc0Zn=jE9H;bpTGKP=oFEaC0h&po;H%&t7HaiHX;9l)os
zH|T`jbGBBJs-j=mzh<1|hJ>Oph!*SnpKy@{-v0s`z5@!?n?=Cf%&W44-dlL+O~TbA
z2)e-CXDt5C+)WUPU&&)P7QZzIP~k4_uso-+i?s?RQQgmC?V4hQ&b?|2i^C(f^h3mt
z7Hef%HArgOtl<Rkvu1mDIN%Xmn5_BiF5>S&tP*kqhlode3HKXEaQ?*)Mt<bU^Jj4F
z)p8tIyORFS$9AI+Mn&MB{hw9>>I;c>ngj`eWqhr|e$GtgzqdBn<7OUJAOo*WQhl+g
zHlpVEYkN2C?fwU*40yO#XHn0ShgJL+pVA~3J3EmXpn1RC=>>%=_4b>U2<H_ZECx;7
zRG9{V2}e60&Ob%mMf?bYyf9IBZ<u;R26qL!PS!vy70FCHyfN^s@CTR$m>;PzpNIj4
zU0}e@HoCaNoJM8;5S(gxrIwrBD8x%%BArHImD$ONtTYi+ht5Z2KUY*djm7J^S?c$|
z^~C{hm_L%4(>_FeU=BiQZ?oA*A3959mHs3$#14sg%EF9BHC?F`spl%=vYAEe^po3a
z_10Uh&X*P$WV^0oW(L`VukC>KxdHz($0+kF?~t7Z`G&ZdD+h4kw_>w`nz8ctMq%!-
znbGv1qaDv*KfSp%74EK9)9*Q2CmY!v$@3rZH3byX8@iUv5P0+6D2M}B3zN(C24$6>
z9uo|u09*iO=GFQE-`d!yfHsjdX2=|B8)4HD>slM|^TvsImq-b#b#<m?Ycbv(lQGgH
zNaweOVYxt<{9r<y3~lp5L@;vNXD_SfEp^o@NWZP;$jAg{frzwJuO=kY%4H@)Ph;3X
zt??eV)RpvVvwJXg+2Z>W>gp^^`1;vryzywOBJi9reWnrTV!%a|`oQv>ll*VOR===0
zD(ERAn%KPy$TH=i_e9_~xV8(s)Ud+a>b+poUg^87toAe;{SSsi+ckeA1Tz5nwBxze
zElL|_v#?=-J<$uVXnv$Y>%o+Fn?=aU!f45A)wX6M{%lyQc?T9bXQRX3=hqrLl&u3u
zn7KHOpTQzs!0tQvn~l?}19-8i0X6HlHz8HK=0FPqWxhmnA|C~dPFx`E#N9zLD?VS)
z4Sxd<+^zXLzyFRJzbYnLni7sq_;m5fY=>rs@h4h<T6SkP?Qap^*8Hb#;f;RWJ6q%y
zQOAx9U}OEIpg&Rcng~E@5+4v(JSR!8h?SY7%k0HBdpk~HvpUD*{zve3Yf8$NRR~~Y
zX4G!_8La8IvV@?pBUE4usS4~wi+c-gs;KChNw74FE$nEIdjLnbR4k>3p1sgx#P1Rz
zjc(ye01PIul=zj}L;&sD{?r2ZVcAHSTP5VxZe&A4>xLxg)7wb9poX*62QXjd4*U85
z%N(4i0(-ie7Gsfu#Bz*&Zrvpp_TY>#FWX$e*7C12BUC?HEBt9C(o8o%twxop7)m3c
zXCcquPQq`76!)G?Mdhmzfz!u`2NP<i2<QVyLrO*|yRVjxPwSi4{rC36TAs>E!3_ne
z4H2|(b!O7j9D7tjv0Ja75YTAmHWKkOc6$%LXx^i+ddeMpB7RMBawW@?QYKZxWf*P1
zWr*z<(YKFm<U)#_A^m(LM2xiXv>0D)<W0TlLkqaQm!)=Ib%Pf76iEyfb9pA#&3Nj>
z+cWl1Q@>S2mw)Sdo!HJM56{5uY(>0Se}ArbUa`myzpbb`BkR8t@ux(Gpb)hO-yCd!
zVqs6JPQ8srGH)@RH%!{bj4y2DUcr(Ma0h2BXS8e}C-K;*q4W{oBmUipBX8!FT(q$9
z2RkIK=k+NR-ajIKt2r~7)dn*JqZ=5aFz1o6THZV5k#ytXh3@47X6CEr7G|dB<-LIv
zUamXyo!$<3u_m8NzlV9nAMKsABj>RIvUMD<!mm5C1)tEtj{I#mmsdcDBq+BtFeg7>
zBR&yb&*-&FE2>ZHIRgG9dm}nau2WeeT-zB;kKSv<cW5suxeCHv1H_zL30~KCq1xq@
z3Ys#<7Dhol`9={^G~$<tGn~(8L`sQ~NtChpaz|?RU`beScq21QOb@V(yCjBm=>9~<
zh{@h<QV<3GH#@ZJ?Xvc%Qq*Y;0MrL?VueZ*r77nfnTNhhmayPI)bMZOd@7F4U{~5G
z;?;C=m)2Bt;X6AlDlH%doBV<U9L?j6TvljwztoA|7B8n=cSohYVa*F<#4RUpJA*kf
zc=brE7Jv5!2AOQ^;o=b{Ze&`XTzQW(7I9r?Fg?ja17kf~h#ci$$?^6aj?rW@zX054
zF5`Q|iC`$1PnyJc5r0Im_93FDU#Gw|gf+aMk|mf~u}7=J9t}ekFnWU}svGAwGkg=c
zTlGy}%>+{wr1(=6fW)G9{C`7va03HO<kcDdI#0~b=rEhX`pLKY00OnkCw}j064TyZ
zHdR|dUpzx?Uedc@4hG02SX>lQmBKwjhHIAx`T=X|yO+(2S9ttkr5ih2hv~RhLIvdT
zJxZ#QQ*s5<J<=wkKIycMHyD4ShOHL!97v@k(@70@_HW;;9&j)R&=??9DR)6Z347G$
z<$H*jnnTjA<nA{C-emnpt+}BcfKa|+13_o(;%F}FYHYzgD_~rkp~`<HK3qDpD{E2O
zWR8t2IOnP66ZahPYKQS^^P5lWE-WoUVP?}(ZcxcSV^IZ0ZksQui5Jlu$l%<Fh&1sD
zqGW+S_a26$T9yJO%h*uC&7UUXcifTchzHYu%|*Vy5%EjHp}TBE?p~b3oRhq<d1sb9
zwcok=a>8|8se81`0oldo&8w0#U+PE!>cWVUJ8m#t{~5PDM*M+BwiAa}%&XFe5+ZQX
z0X0zNxd(uJvg*V4<w4p&Z(*G00iShe(XZy_b$-(H<HFF(ahWgWmO-nGxk2Ifob4Rl
zd}WF2h*B_1{LgI}v;k58U#;Y%kgr11kX-#@)uG9f%6>W8BCHo7b-)Ke^qjo>z>yGY
zQ3K#c{wz@0Gr2?0c0t^nVZ#eG@?<5BJ7alulPXWB#i2?<XKrTguD9cwnRR4saLBrm
znWq|Pdpg)?VKMpJ6{H&PcctlZ50ox(Qk^yUq@Ti2V-8(g+SNL1(oG*h1@hU%zOjV^
zC7lfHlKnP_QVg2l)iMLr-sQ24fTYk71@&T5_>o;meUKvC!2-|$;Fps@WSD)i&OjPm
zteBIAzNmp|0^b1Y=>g8D`xGEOZIXg%VMDRD`J>SqHY>c!kq1#7^~1UoJ-6i%;(LwA
zajCDohh=Xd>Gl&|L)c>>LK$LY_rE~|DTW8yhXWg#6Na8hG*9bBs1k1Gl>l<Y^%p?5
zies`*?!UuMKC_WIIrSTFaK5)ZV&neR`*MIPoFBfWbfjWNpCnNeue<!~h`&TgINzJ{
zXz4g6^Wo|B?=5$h%p|v7w2kW5A`W&yzQXuG#S0nMe$4`m*jJDD6Ihbp&k4>OO1z34
zt2xXw7xl;kjLSBm?G*9~3*v%@jw0aR<nO7NapLE5oYoUPXQgFv;#JP1d5aCG4fX3!
zGQQ?2^$ij5oErQX@quipu&EJq?emcrpKsj!XaR75PAJ)kGYBrXF-5mq5d7T}rNS&m
zd}_HV0uPQwFbd!s&!RTko%MNZzkdB>{IBJ8i;XG%CZ4Nice*u;G5dtCnC)nD+a@ck
zN#~n_IF-M{z`zm9FJP|jz;}@#(0No(zAO_DTzTZsX9gf{RxG%ruL&{eC&F;bwgc*^
zBVKutmhhCUrCdJp?N>V9XeKYY<D3`3#}b6m;0_i^&|v(RGWeb%u*w({3Hb(rEetZ?
zC@YiZNa^;n9)~>goFysy9Y!J36#ITk#I%`(o~uoKJV$(i_Nf5$r(~L@K9b<%1cS!R
zPJLn7=Hj)G_6igNU1tpJN^NM`^nYsnXj%Ns{M<kf{xSLfA^`O8p0kzR;jt@g49+=T
ziwIf}`!dT=CN90VcARg#V{C!r#X)-Dd%zgx&&71;8%aRNz@R&j()`VD5r3pNFaR1U
z3zJ*OK1KXpCJ%L{7#oeJ+$h3c@kxVeZc1!L;^gakn4ElwYAA&X$U6SQkt4lP1RQZ-
z>gDMXLY!!O>oMk+_2P}l<O_gPLq;16)|$D5Q!>k2pPJjdL-%&JK{`8zDqk%FTd06a
zej+#cr8@S-3TplO`GZ5(GwS%rR^7vdUQPPAQG~xzU8_0bkETBxEeTCKku#e=9)RtZ
zV6-P_swZSvW0(=PW)J5fDfbauR@^>qu7A#n-%}I4@@{}L&Zu5K$nD`rI3KkE%YjVb
zhSGFZPb50G=NCU+E)nb>8S7in%>Vs2h6$DozBsd~H92vhUwSrb_4Wuo^~HlxaC5$;
zn);dT%#-t9F?B`|Q7$Y4hKd(W(Nh>hsM_y=BV4lVI-?uSyhX$#Qy<i|p)Rtc2}(+u
z5K;<B#om@<r`rExL;%lI8FU%|2CF8H1eBw_*ezT0A9@DKC>2EK-#YSOO~fN9Xzo{T
zNiq+t=iaJuBY?2Lhoy;($bD(F1^K-RA@IZ-6iSeowM^kCjOv<;*k=-zg^pVKmlw|J
z1JSs{6FwL+%+~j>B4kCTp^C=~HY6F~1_d)e%nS5NV~U(uLY&%|%mo6yApb55`|8|<
z$<H^Eu*_sD4-8%VLd33ctQ8`=v<Rn9gQ2&kXqv(OF5)v6Ph+si>8FS<5!Z;nB+A$9
zV)x$7ht+7+i1um<f}HxWWc?c|w$8>+v_#2@0ngG-NFbM<;hg;ad&D0R5!6PCjV7u}
zGceU*)r0>FD^lAA71oBE+JC|Fh@l;=HgiDH)3JX;62Kec_WZ;1MOr|g<l(RQmGmPa
z+Gw3$!D>yUp97~OtpSymov3|6Vayv0H#qpr%-*--4rx}HK=EA(+=-X373dvg)bsD2
znYQrkNkqo*qP<x~Ps;U!oz|1piZ=?79llvz50iQ83vqK{d>ceg%YT36-hwvJX;T6s
z8<NKUfcL&)#vRC5B=aJL&cN6$mQY(q*??~u?S-1@_0>wCcw3_1`D|_RGQITljS)|D
z+<ms8)DnYTfijsD$8QgxGY>wat)Rwx4S0CP#%(32%2W#I6Sc3&n9NI3$IPsD`G#yM
zV|(uafumIg(g2u&y;>M^`JJ${RfDq9GceIV^t{YqI<UJgkZ-DR#DZLjJ2@ftV)GgO
zc?<t?AMrKfZ}XRGoBQ7{J8HHwhzxGCfF*t-S?Rdo!SvY+3Tom{4^~4fSQNp7*NA>u
z)mVIVhBF(aqhFt|R{h2>L}?|DOjBt|2s+O1oQ=G#DHwHoHDvt*$F5;HYaIAS1}dqL
z3Lvjm5)>U5gt2Nf2CL9Cv0Gp8SUHfeN-DUzf0f?$z(~pNN<dq0?@3%7!H%sejL2>c
z8&zb+{n~j1Ch;8cMA7nV#1!$#X8KOZ0$$b(N@jh2;`itO+?eHGu;p|VM8TQW`m0!8
z;JjMX;@tqmZ6#|Oq(|<gw~^**YAB7ZOb@VhZxu(hvDcE!mufd<oQ2{2!WO5XJ4Pz)
z4eybdq0-iQ>mRVY7epQBHG)wQS2UcfWlHY9pt6=0;-&Ju+Ka>=IkZRXEiIGimpFi)
z&FwH|=?A3h{2uWqpXMlooT<+_=iQr>x`h<c<D^BTjQb4a2<5V#Q*n*LR`>Rqm-lZ+
z{5~R&_<@w?nHOzH!q9<_`^{t2ld?@<?~Fy+vk_?ql$irKP<prrc+I-MJ;C@j;$`#V
z$`MrYN={E^O|FYFP6k8wnK67NBKQGVIE##r$rL1i>@$th74HF-eBgp_&&^My4m3A_
zKbhs!Rzu2`!>SfV+}vnz0ms@vw6;)6-~|Z1GC1~;+<DI3?@$J)!~b0cz&YY881Aqn
zz;*owoN-AU_z7-Bs)-y!P#G^bm3|~Q{}l1Ut&Z?KIS;b4c>Hc3$7C5yOFYY#4Fz`*
zpEw;i2c8*di>3Z?#CT^q^J0V3Q~x6X-5PE8V7NsKxsriq{Qd($IJZtzWd;&9r5ebV
z=FFU2Xt0WOVNOay*VKWG_V=1)(cmkAZz3vfAjsZcr{3}AtIzhDr~x~9ro_8&fcuC~
zxSpBq`DCo15Ny9$#L7*-afRqCK|L99PbH#!ZG&@}<~y4y$2Qqt2)=f58g;TR>3bwR
zlO-;L;V@;|srOL0yMiO;n;&rc>f(D_^EVQq3rC6Yscn96YZ{Bi^NT_l%-3m}hRPtB
z$;=k(UqyV5_?yTmb6l6uWM)$Kc9TNsD4?hsT)`*5LQoFyvG>U^wfSM@-jy9I+7Y@k
zj`1TM?h6N&Sq3_xI4(eXI&jr{1q7YnhBCXvXFbBk3+ZcR!rB`JsEXNz3bJ&^6BzJ{
zSG*&wT<rf_#LtE=0&CloSw2x%^J?F1{ZA2p&7i}N9KMo`2&-PaCa*=-{g);p8LTO_
zBo%FwZFBn%OMiC|c~XY#OvYKSGZ6XDk7ODjxCL)#oXi~48dzgoe`!xqSaXpU;$>Y%
z95G=X_&xO?@dG)5ajOU{?7rtDXjOVdC8M354F@J@INIxof~V?;N#hy;W-wlNc#l5f
z8<MUIl)kFW8$~dY>jSR$%t)&MR+jd%tJH^hhtr;aZJ=EyGN24~$^u7-g$W+@ipLkF
zcV+1}3@8X-FX$JZIeL|406VnScS#mKSl9Xo4sdNoH}S70DtKuC%Q$O=6|O}j1v&1^
zdoso*;-`o|z_zIr*5DFG8cpW<e(4CP0-Z>RGk0wNyBMXKhcccw&A{#>{$5i*D&Z~7
zT^*L9nA8W9IJyfML=^{5NR(nOjBjq?n~3sv5Pmz;g&ZtVUM!xv2*Wl!H}9i@;$>3;
z5*9<HnY&s8?Gw#0_nfqiGL(KOGf2|W;Xd2%2lOFn0GTL2sBk|Tvdt*V_xAS}>vyF?
zFtZ-u$%bEv|2V9tspJ-VMi|lzBv6r4U+pX|DAd|w`bgFxxTM@WUzW%hpNhz+GM>mg
z)qn<*6mPSTyJgQ}X?Jm=HhWqRbdXG7hJm$}<zG>8Wvt890Z{21@Iou}V4CU^RxtvF
zip@?mIn-IIF-!6u4DZRRvNeZQ;p=uR;B&+mnyL=`ZiO~%yAekG;EXyiSlnpXQ}u*}
z1Dz~h7fj@xY<Te1n6KR$0ghIyXc$V8!@G9q8VXCn(jw|?gGWMs%U>#&3}L(G0ApFQ
zJ=l4WqM_$0eu?-qRC^l3FE&S@hX{~Y+8IPnqh}HEA>wZ`J*=mQr^W?UZR<@ASEskZ
zDkb*6vvEpKj5|25?JqaFGy^nv(|N`$>T#U`1pC}_^n@NNOZ#Ui{KtsD>l3|2G=GTr
zWXJGmW5=C&*p`9L#`DMsn!z-frByvQyr<vAg%=`AGLZv(?|G%;wC4YETtJ5O_7w`_
zYFx-H3_Y!j@UzFevs2uWN(y?F^mSd<0A$9ie~9?^SQEsAw67$dWl*a}Uq3NaC1*i;
zg+AgLjZJV0pKVa>ZS&=Kb86>mq-t@-lJ=w@XCW`<OX}q?g3(@jlZs%@`1HX7c(&}(
zdn1WW5AR6ClU#QK>$MR-+LYHJ;XYCoP-~hSg|8Y4KsCWOF~i)Z6I9@Z4%5Aws;gsE
z)*+FR=7v|VJY{7PYjFX^CWy-o#CO(5F6<yF{=ozSw*_qNvKbM)GucdU@3~7^oB`_n
z-ZDH15qc{>_U{qkCFfTAIdiB#SY{Et(lXX{w8CQLg`iQfFQxT!=3FDC(RExto1S+t
zkI^9M+(1AD$+OgV&o+Z>2P<Dfp)|?*skqakcIs(cQcd>E+fPr){d)zqS~Z<Y3f-I7
zXX5~8BvAwN^!5~8^mJjzWZ?jF=<Z+e!H<m8FBw(X)5hGai`YF<nZWD{Xs0`yQn^vV
zmHp>Hx5?(|OAWDyg*$OCHwtB7%w<rz`HJrNU`cevH5Ld~MMxKZ>`>)aCWp(c9W!13
zCgKx1uM_J$@DvjOW+YBFiT`NulBLo07xSm{Dtt*KIEnMyu|hi-wKv>}+~GC@kwIIw
zruM2xC3=={zP0;=K^U#aLklJeRt}-^fZN=!&;P)E&KT+?;_pO!<J~H+9K`Pt-`#{3
zHGrIWySIK|>A8aIIwgDi^3h((Ge^BMt2)_e<YApYgz-0%v3&J%KIDno#C8!(Ygphb
zb~Z+?2gis;OY4XQM@%Y|+M-fU{#Wq*^rP}|;CeeuszBk@Z&r}~CrZ=&%+`BvdM=NX
zi^Y?4H&80?EZ?a4xr-6o?>@qwN_o$URE9uzn-Y$&8jK`%dmE}$FK3wN2?DzCrrp9u
zsvXGy6wG#nv4aO-Nr(neLP-pnY^ys=@VNO1zEmPII*9!brckcd<A^=Uzv6E9Bw_Qm
z|1qMC_}!-Wt9j`}X=*JM0j0ie85c0D5p+R(Ru^Vi%F%F&zeW5K@g)f(3ZRpzU@k!3
z(#kWP^)=o0Z!mfzz{?Ld1m<cK*l`_zgS<$bTgE6~4?wLwPMJJ^;B6XmpLq!y!*hjs
z)wGBmCYMyWkvOEXCr!H$^%Ff$-jas|q^7)OW;pd`zZ34^1jjI1>7eD+e4l9(O>3|{
z5NU=eFKk|khCg6_-gsT|uU2?lMn#$i?=e|#*}U|ROgT(-xKgK%DE8)3?3aia=4fX4
z6V=BSt2gh#nJ^$>nz#fdXys-GpyVkhhRA7FCQVXmv?#*W_S*2=eojpAnaDm_FUsN!
zluQEeKwvqW(SZXlc6v8dC<C`Qi%B0%%cJ%{*V8RHz_*BBBEGV0M<YWeP?`{eoHx&K
zAQhVU=8l05&p^?7eR2K7HR3xevO8;Qn3g`|lq5luwKvJ=gQ<2k8yVP7nnEj27SNaz
z-WMF$WB?`=!TS;pIQPohxpElip3y{i$utE8prE7Z16r67F6kK%_Z?eWuu!sx4(~Hk
zc%CovOyXJ@DogKQI)Kq~)Qs}&Xs<}lH|y{coXtGTV42Zmxoek{u_oFZPo^JKzLa7A
zds4aW3|>I3VxY-<&mu)H^duE)^p<hvPb5oM>?*V7fHNwMjRI!iF{S%GnFMKi547ng
z#%1Unfo>O6M3T41I(kb7Fk`4k8)A{;PR~$NLjs_}RAgd7j}f2PS1D6UWG%O6xn%$|
zSNG8hk1}at3UySZG+4%>zY@WASnM1FK5*+hv=HeYjBI)XAW}8#!Xxc0qu(%~5or4Y
z4s1*xJJ>#5)@MjMi>pO&*A<+sr(uzZPkg_$d}YQVFHGM31N-p{8p7=Z(4n?@Kvg*M
zmg2AsC>7F3M%yunw=oHVyw_j>@R|gW-a7Cbt2asx(Q)x+vxZTU9TX(bovq?vqm|PX
zHX};Q2h3dD9c2DwhyL^%FZGf{bgn`Dn-M=n{0;geMti|{zW#EHO1QD2ON%oe;2%c7
z?Sg{P7w))Kh1{l~004jhNkl<Z6=YxGm1_8_v=2lM^o<N)GU=w02rr$b0Ak`&mxxbB
zX=}~CMD!6q;TY=VCvqZNVJt*XNDx+-_ws4&3<YXkkS}6-GG1l_erx@oX}pozt6I7%
z@AO=JUx%;4*FRyT2b1u0Ts)0I1OM%awN57O8%$SqS@Yjvf%1(z4?OOOs~HX6Uf%nO
z(YTAu+ScxuTB1~ms_3n9#Rx5v<E`hwcu4wqZbjgrQdChbm(h3jjSc;b3cP1TDmA|1
z69-H6)tX&h8~`2t9@JQ2r3e0614Fm{QLpRlG45eu0Fpp$za-^-h25&q8m9;FVxe!s
z63&pZujbF%H4`g<I9jE8Wa;;~gl6r!w1*|9JgoUfQ09t}NF;d6V9JQch#w)>1$lD=
zYX=i1R(^&i>+e*%V__pk^;gv#uzmmH<|c|houM9ylXuB5u8dAdv0Ve!e<l^Ftw}h|
zpeAB^My4r$lEX{Bnj)m$l;QSld(Q(mHUV}r^pIIs{Nf16V|93(9v_rj9@Zx@D5Lo-
zvjXWc;scM9ajFOFuG+97nWwlM710ln*<zFizkX>&gTX}nI};bE64WOEYQF3P73^8I
z@=RakT!Fo(F!Z_<m@U@-1Owr%4Eb_QQfB}p4ZsyE{Kh~Ol}{(mNQN6;TwL^4&%fpj
zfYi1J^75C>3{09WX}AaC{>}#Swp8K<zy5?v@iLlV#OzK!Leab=B8Uu<`H}K(&xzP`
ze)o2G7Gn&`WDRHP^c!EAO#L+S&&qTre*htHnNdsHxidk0kFawi0;+b>n$$>#R(1XD
z3XEj>6;u7Yh;OFbI`b>%<N(WE%sYF>)2bCg5u;(L_X&tm`f|hvlz1HJ?g|3a@0Ra-
zO5Lc32v;DVO~<@}0P|(5)Xpf?7GdRlJU&ExOd`>=Ocv%N^1v+Lo-N3zAZ^QVB^f9%
z{Q`O4Yz`$OHa0Sd%9zL87#WpxdU&;sBChzq*|O)BmA`?t=H?8t^~K%zM$7zFcgXPD
z(lqpI3aw)n)x1Mj8=W|xRW;FkOCBM!5lR!+sQ~1oIj|O~@J27?$ePV8f|!#$;*pmO
z^w^l%#BR5sWP7BVIRS~39Jg4<DY*qXN(I0dbN4g$Ji&i$P@owih=$&unT_y-T&(`M
zE<=nsvw_5^=KSifs>e_ggByUVxXD@PI7EDee0^mVC!YD;GFU<SJ>yAL!}siV!|kNg
zJmv*^VjAdVT?-ipL4yU<ld@1T)%=j^sn0Yd9IZK{Nfa0hCQoV44#9=g?4Ljuy<w$=
z^@UAPod2~auifz{IcxKrK>5j94Br?d)^k&WOw(o;Pw?9fA+KVsMo6Ux8!oZ&6NL%q
zN47q<9=Zds1AF6NZ)@lkIHiq&=?c4MSV&qy3_SBI5p&{*HKvCC2A4L$lP(Uht}Fy!
zVm&9oeZ$D1w1%UnsDWZLUYY}=++b$0ZG*zg?Y-g!#pXBCuanIM-}tCT{sv4NxZAyD
zbJcnpf@<d{V_tR<e+d*9j@=O9`@>f}h|2%cD_kvidP@+nF#1!(&z!gsyDID@n;&p8
zDT|?nE%5afxwAy0-W0W=0-8XVgC%}aw9Ucf85XW^05f4EPmFq&jM`1OF_3q47cfB<
zTsi6mRdWe}Mhajjrb(phFCEEbaBr3)<q@BaeC(1@D5~@y8QycDlrO(M+RNseBneai
zC`@OlG^wNRR0F9#Zszd=`RbjuduNu@2BG(5-ODQFU`e1CsiS|8yn4iz7ye6iql{Zn
z23D{yXYg#527rkM_e;cM#Aj&OS6I1667)B#CT@6-1|?dJ>&z^v8cl}QCJ4go<a^}v
z7am<A*2^aRoIkly7y=9VvX@7>%i`iS!>22vvjk|hu>f;r@!1-ae_cF<&K7?pVW`|y
zZ~sdJ&}1xq&iBQKUKxSgW6Y_HO$Lygeyg?e)s-5?!)66A2yP8DcxeY+z17m%Z%d)2
z@xS7_O8&gDDtW9CpiEG8n;i&tGF~BnZxY~uz&r3#4S#h9sc^;Mw-p3oX^QfQk=Xn-
zB5X2eo-9Gl)X@nSSy-AWiTV|zxUMtc9IBwg?_78>;RocQvnuxUdY*sApv5~v<iUDR
zM4q*q1%$9JqO);9(#)yuH`&rIo_|~xYHBiY$uDN??PBQk779DOQc0d8{!Bq#7^*c-
z@$2Sw_M!_F3J>&M_ht_DW*KuN@jB24l>xJr@2j@A&EgIwmbv2ym+;aPm&xo^1-AfH
zQuN!K+<a$*O2?l&yO_FwJ<+@3nJ2OdiJYJKiB$kNGi5taKFqq9C!3L?j+_nnD0-)~
z3eJ(LX>YeQ7`RW5@nnMjny5G68@J21zKsLnA}5vjNKCz&0)9vWKxtL$!9ct&{zjkQ
zj%0sPL{;HZ6mbXpDZHVZ-=?rMFG01EGu~H!;Rp-!Em1M4RY-j1NQiB~WQ7d_OY?Wf
z^_4M(JqP6!@eO>qVf`utjrfhL4Zt5s2p;JL8rFRXLJ4<xf59T@?t~K-O1uel8K#GQ
z!5?PehPU5n<IJp=s5QKnnQpM5Jd_#2xZTyfP8Z`3RS(Jd-R}`kNc|Qc#}FfPsdq%2
zH(p3`&mUIn08pbp(&3q~LP<Aw9LFlTerazIl3GhX$fMFT`QgHWRcJJ%!w8yo6$5vy
z-}e2>g>)Q&k$<?uiW}fXT8CAc{!G`zVC7fIdvgmwqc46-UUJ0RJG{UiXP&`MZ`Z&|
z=8#m@Kd203z&d(h%%TrOrQINIc$zGINFMPa;xCd;`mg}J<Ak07oi8TqnAXl`k&fq>
z%ZWvsJ<coO;0T(^lb;!waFQV;($AkGzG7Z_&Rt?a1&1>Sim0ZN+iQ8N_f$ZWEm31d
z5Y^v%JINa?#AKG7o0n;AqY74}S*ybI2zz7cYkP%|2%7#q2h>>Mx&X>mTv?rK7jrBN
z6-UYW8u&}Cv62nI=X=DrbrC8MFgAGH0jJklS^bl3Q~J>smR2s2gk^wRAJ~^HF<{AH
zXUOzysBa!5(=bgS);PWZnyJcjv0|yJWaVb`(iV8n($N&`9W$0ytGlcz5Y3*DdQU1C
z^^S<nNsE}d9Vss#=A685yj5{KeVh3I4#*uwYCO~5^AS(J5F%w(LzCL9Ck87U98nrQ
zuPGd96(#UEqZ@9FFDOhbmQskq@wxrV2YVe~52Sb^Lf-CQ^n==83h-!dS{x^<(i%bs
zVKV1vyQK-V%Szk^1EJoJ9#iiu_fvmQZt&pt(Oq5Hg&asTEVttj@xwX^6k9p}Q3sJ)
zSOc?0VJuG0%@ca6_-q+{19jh{XWMAx(gDaQoV3<G`w~$gPwOnfU#woXw1m_0zwTfc
zwtJz%e6r}?qnV@*!$4BsCI|RpP26+bIMIl8uoB^55P9J}R0+@xXJ)sfVY1kCpliCY
zv;dYFs~4c;+pqMbDbM@|31DS_wbjZ}6+~-|=UV?G(v$($voj}^`XH|qUhs860J$1!
zPm?%5meL+{l(P3AyV3HfH2PLB7mwslCmZL|QJyH@OCLqSRS};!Zk-io#7ejP*9C{(
z(Vp|sL@NCfICh+<ubZ>5avbG8;xkY9MmL5)tp5JSBXj`2hNB{z`xDDF!D6Nqtmsh{
zYFev9h6PzS`Nfr68~de-OG^f#zN~Ekp$hcD(x(FJJV(4T9s0DMgjQITy4obb6GiCU
zB&ow%hakE4GwAmL%j-zOI^xE4GXV62j+#9FIpUF^s~{_0JN*bZ)G$|XR0ZZGd@wQS
z27nZ-*6fO!F0h(MkW<Y4$rs0P04YlGrSi{Y)A-JgAj2w6A8cCgj)bX(QEJ(by95M`
zc7{%d{X3kA{OLfcVnapHko<gSINyxMYq-IlM7&$)LcQq+GAjbBAk+iJUe_sb2I`sb
z@Bj@LFxh8z^=mT7Ie7|+R;4&RfTQ2&jZFIpCd+YJLU(3j(9DG2hlrne;X&p<A@M*>
z9@3f9k)E!Fn%X!6BShGh(2-l;>}a*NDtT<_viGK6=_&u-bB`@??y_#A%J_0g$+1AS
z{w?Az;&&X(Z5FO4@vM*!NKsQFA)6Drx3w5~urhIjuaWdQv)2e-xYA+?z0XN<!HJ)%
zteDnsqMtAwM$JtsVpkE5*4R24De>#4VwB|v);yR#*5Nb?a}6l~7#L-nTbdT8`8iul
zd2FJ$fn}?VAsI~}dgeHl>mumLn^yK#^3Rz^pt@OZ6{55cWe4;aptHh)rs@Go=u#2P
zCF19ZuMxjzVA5bopz=FU7jO128Ey&I=&Xl1Gv!dRsbdf?a;5IGW2xj6Un7ES;Pxr?
z1C6~pZ?ujUd6>f5JW?fK#xhiPP)d8b?T3gm;ssUJHV0c^uC+nwXAagtW%#-&1X2-c
zu3g_*jL*z^E2dHI);L%Ne<6M0og9tDx6MZoB{RUhTuEY%n5XoN?+Tf;A1?$$Eu_ek
zAQ&ugh}e#ned?*V51df(%y-t(d)xeb=z)BxCY~$+DxujZ;>r8h2E}$JN11>ZTIrOG
zJVIb94^z(i2p4%-zYs?URn&LS0UCL!TFuY3YwtIIT!uPX8@gK4I$~3-RUA5~;z{nv
zhAI$G2cVy9QEom#v7#efR3_e-cYbdTzl+ag)sKlr_Zi2|KJ@oQ$azs76BV0k0N!Oj
z(sp-@_>m!P1wl5|QA|v*4Fauz!ac1`Vzg5G$t=MrEY_*y`NkVAcf3#~OCJ6y;(Nqj
zns&YbcTRw&5f`?gU?#MQ9U0_kV<0Q~kxLH9u$f0r9HhNnWU^-b-V{AU;=?Px)^W~8
z_P>I~m3C!YMm?EQ#jn${9HoKT)ZyHu8gIOPZ+1NwlYQj)<`bu4@#$@RfVO*Y(~AOo
z7}f?bax5BidIbdDnG#Kg1KTddOMD?OGyAH0?!ueRD*sw;TDRbWcgur4@7PD`W2ply
zdF;SxskgPXi8eXu&xQdFobo%!@S}0FA6e#-pI+1e3Pe78YZUa_fC@!@0wOZId9klj
z0kzirrb@;Jhugx_3}|JgfG-Wl1lrPD5+?_k8(viO6bq1<T>7%<{lx-Iw!;~l|7_~N
z4H1J@wY0=%CY9z(6&#cMB#ZXzE0xI%@9&L*R2{ikU%$j}wwOv`abs92ZPL$Z+0ly)
zW9zM_LnctH9q6pMJXp%v(RgsBypsX%HhSsY4RvhQ8*{OP1$vA4vY7<uaQKy()0Iu#
z;WRT^X^z^0%sQEmw$x{<>{ryYCb*DA0pK;_PsF;+*rLLEvz1w1HUn60ZnL$a;(@DF
zDv+*!wlvA!iXmR{$_++c5&$)s*$b{s%u))N<Bb^tDU)596j&+{&!|gFq!tcn&+wMq
z6ql2U<8$Kc)!~osz6Czk7F;I0Pf3-zTGhyJ+c@xr5BT#FX6(T8w2odS?sjL2(~L*!
zZ5l<6OI+N)pZJS)@m%>&g<WY|h%yrXcaT?&P2L&FKPTj&i*1w0<|lw>XB2{f!?>)E
zE`CCJ>I;9nMsuMW%ES$B$Ut2Q*jn{SwxCElaLL<ew(0K?cYyuYc<(bZ%-qJNEm|p@
zFbor=WDKGjt)H}Cd!OLm&X&Pw*-P$OqLM86i2}oHz-y8#O3fc!d?N$wOru>I^Eg2V
z)?8WHRO)~#rjZUX#s{OmJ#o3NSv-lB&$!#%-imRtN#Xrw`jitHA8@fXobQ0}{Q+(2
zcZ!MICN#n`KjRUO_>~vPs)wWqI?E_O5dd;lZh$x_EHggAa@;-@tJPTzWZ>`Rrban$
z3V}oc{fSg|<SwqDiU~+EB<F83!dFr=ImOAKcgE39Nsx(cYb-dO$Rje#rt-viZl1KK
z<RgCgYCZX_4S3lgP;;j2?}(JyGUUyl`7eA=e4t?V%u+o<UAE|d?l7aP&GU+60VI>H
zZDz5C5NP$)Y?KzkD$`2sImwAACOCa5i_n`0II{-EmQLW7gZVCps|T4(?>?D8q%a$(
z5)hl?F@#;u5pT=M9yzM8Z$=F*Z1@U%FQ~@tp@fwX^zZRkDl<!qCAHMmQqcKx^aDJR
z<iB#O_iTjp3rb5qLDDXsKjY4?oE34^75CqR^-gR4DFES;iC2I)m9k6R;CXW(rue^u
zi6&KvQ4w%ML@)lPGa*{ehRxaN1LvY&6X5elqD3(}H9t3huJDS*=0M#U`h5a}1upS^
zDafcyY_@p*oH;N`q6RcOXRA9LB7R)&Uw{lc4(u5(GC(`kc1AbSkK7Q$xU8gf2+7K&
z51#bRgT2PI=d|bS=#k7IkNE!HIQN0-;|G(zzpxiq8x!5HG00%=yHSqa62>M1yU3@F
zhZjC8P4?8&6!{5+aZXyPZhZhMPpg+_;(s+=b-fj*-UYEz#1!CVA;3MP#6Dp`3X;Ez
z4UQ;`$QZ2IX?PDlNbmd_F<~?pq%jYcyPVg<&<0cadHZ{3uK{Su$lul}*km;FU^HxL
zjOjJvJEL6}1~8d?|7yFe6t*CwPJo~d6;O)<90&k=3xfl*3jP%FWvylCiLaj`CTbmf
z(9E-uY7$%BJ{gwu#YSKY{!ZA%el-{R_Wj!06jYB2g2_j=NvZLM40qyP2|vQK35nF$
z!f5d<{it3o2P`4i#dJZkzuQj?FfU)R<$*GUgj8V_==7sLjIi!gsrv4@=N+ZcZTs&n
zWyx(8QODB>dzMKI$|(bh1HYI{ttgyk_JY9Fe#3a;`TVx4xq-6@{P-KAPy$D%4eR;C
zicad^>IEH5Y~Mor6_nUJfO1OVW)`bYR2d#jnx)#vVRH_|>^A=p@%O;FbezQ<(L>Df
zxQ2nQ<H*^9SNe^#1yU)b{&POR-y%wTe_Mu{qqjMKZ;67Z^k>L%R|SRWdRbh(fWy)Q
zB=|j8jcR_3?-X(G)(wol`g3%CC4U^aenSO#e&X}G2|+eoV4UCZJ8Hv|%DjpAgrNQA
zKnGCAcZz*JEFrfd?DahgB_~@ANUj42R)kp}@k_*C`Rk-)0JFmAi8`i)ZI_jMNJaEY
zA=O3G)Ap0GiW%y^-r~vP`nhe=NVuCr#|y}IXZN-7ne|`0h!0E=`W|t$7ZvyRYNAxf
zy~9dh`R9^YEea!NU-N{6mpycb;8a<ZqAg5MyzvMVr+gp@C=5ZWGFz{;6xyMQqgRqa
zPBs$0=jIn6HZyK)z!sl5w<Ut~0vf#7q~L^TRSCJk0?few2dXmXq>*l2fj#p!9~ERp
zyXI@}n?a$JgxJP9Sndg(rLhqCdnT0Qw3Lt*fa`|;)UIz~;z+i5|9Yvol=qh$Yw`Vo
z1gQY43U^74)WeZ&P>31v>GyOJkMIb4!$X<Lxu*4FS4-ygIpW`mD5&5EF?hp%f+gA8
z^u`0URc{-<OES<~5x6jDmYG+sIao9HxFbp}Hc$v!)TMO5^OJl;MIcwP=;RcFP$%MB
z#3|yAyezYk+T~^h7UUd3uo3YWCdZiNrAK(?8;w74O23*=QC@a1KywA0q;=@hu>XLG
zXrppU4_DAcxpu%UgEvacy2gmF=&MzS+orIzB|WKjjRO-s|Hwu>zSl_@Y_DpcXH4(N
zOv!#T1)ezW4|sg#*(K+3Fj$)&0EwV=Rtprzd;WI5$>EEC$r+WTQk)1}<%j3ZISgTJ
z%baH3+i?p<<5Y0NMREWutZI*cJ=!q00y?Cyw|gyofRuBi{mBj@pyh|&HZ|D6CSUAb
zRyj!G0`rupH#b-Wt=WY-zItc&9PK?^)Plk(I5Q`P?mrj-?^V5bWw?sz74HrBf;mZl
zC}DNFej{=7h5oaOJL;|PTm(^i@ycGKd9{;286N0BLiJ`Rb7h?AuyoXk@ML(;gu`6u
z$T>YQ6SlB+u0;|g+E}%c*+?4GpUq%wus-`tp6j-Ul}VIZVn=B%sb-=_+Ra^=HlXB`
z4y5*(A+u>Sh-VOecZOr6e#|YrH1F|GLDn#z(r1FTuv2fP%q;nShhALrf-8VJ;03B>
zyTam=VTQdinFX2A4)`}GXn(NDkhkxbXr@Y=fuGrorlYA>i{$$w!@M3ahT05OeDpmw
zR43n8p+SH3K;l(c9w=72+YA6170?`ac{EAfk)$|#?|lHxa!875S9?>mjI@kxfUw5e
z3{<+(fSC@l%j>#~G=aL$O9Ao7-ARC{=$9qyCj=*kOpsq<!rsOzEOg+N^_}K?LGu3^
z2Yv<^ZL3fw@^HzSqzI|OfpX&;jg0CeDe!o&27pHH<+u(218%7_s!TQYn#UO;zCojJ
z1b{$rRB{VP8$B?>-w9i$=dWerV2z!RNG2*f4qw*Jr?Q@@H(uJI7`lfPzOxC98v!)4
zZj!=$;hAdoflb-2;bj&E^N5f3Qn}7lNE;YH!padIqXlvAtnj;$do3ElCd)-ti1v7?
zn_TE<6`g8)MQg$O4pPVqWvLKivf7bk-o7vnxi-AN&bI*K49Ic-Wr2Im=~-Mv(9YB@
zHCe2Fv}e-~T;nz2&<8Yl(wj>^b;bG*u=jtxsbZ>t4?8$yag}p}Nymn%IJ#JcHsWi<
z56le<qX7p<sUy#ut(UO3x3~v}+9L(`J>QUl20DOy-aqtYAh^NwN)~H_MJN$nD%Q_d
zbig;hz-Vlf2ZeP%U+phV0D#~O7V*wUM3ioIva(!ZUES#;^byYyUq~${PPt-$#5?__
z{VI3M_^aM~vE$+cxx&W!7ZhQ0%(EnkIFeai-g_ToW=F7KFFV%%0YlC#%UuxIQvDxI
z%|3F98$7w{kQ+LVSh8R$X3U+Bja83SqP<;T0RO<R2yQm!13M07w#IOSwNTx2JCpVO
zi`pvJU!FjQbG>}Wi5%dsM&xYS8oh}B7Q`6PNvWxAFrmwK5jryq%h>aQLp?D+C@nxN
z0)Wa-SBroxYT(fX4&!?Wq_>;7vSJTBWY1;N2k4kxSlRh#EK(N6$8LP&!^28p_h^RZ
zKUW)~=*7kTi@a(vrW~iCwCGAMwE&SBe)9r(6#y3{@<Ad0c`f4Af(i2Ouz1BiSFrEb
zvy=o=?_ev$woE)~fySX<XJD!tA2|9~7F^<y3z)RtL_p4)2GZL*8!UH)1yh6CA^=}-
zKmz--<o}Z$y20d`8}^rxY`$XKLN&d;24dU{KouH7mCPr@o<St>Qt-lcwcxS_YgZ~O
z8y_jPqzHVps;}q)C1HaNTFEyhJnlvYh?oihHL#+uJn4oH9?AEDOAXARtMCYC7=Tz`
zzXT;o^*>oclVhOwzY&8KpI@oNs@UI|16BaI-9m@DKVQ!OQN?F4VRyM+LILN&I*}!y
zmFE$$zYzegcm>gJy){H+L?1oN;>+&h=i~7UY|UgOhH6Xq4ge=RrzHvAZaI2W2H#p`
zePN!zVl-7lzQJ|({HMlRB~t^)%wiO#--y8XkSbShHP#XQYOXiQ@FWMC-m3s-?AriK
zU0A(MsHL-nBJ(ecB~^fQOe7BZ8u1eG^ZK=-^S@ki1CL~jBRN&ZbG7S6CjG>OJV-Le
zEBccMo^fZ%-sCHE&l|xeysz%CWNFS=%Z3)E=5V^N&A`MX=j69OgVpE{<S|zc<{0tB
zVy6z=MQg$b72ang5&vj!Do@X#nl)5OQDf`H5{K@+BqFzxdu~TWq{f8}N-3BLR_k}n
z0uu0>IH(uH1B%`Y6;aRCF4SEP)|;QP3hk!-2MF|H??9{w<|k`1{)Q@fasQHuXKZqg
zR~4x@MSNvDQz4be&l82$wG6vlD&jVdaKTY+setc*1c61!NiMcPgk4SBpB7xy`@|A#
zq)!uB&j8#RE%8@F$7I8O)0>~q&zwF!TUYzrvX@kJcq+l`x%oh^v4_HHsG!eV`$NR1
zq=tEo_!{wx!R^T!2|MH~1zF1NATI(y1x#+u4GbKnBQd34#{dO;6>-&x=bK&<#~kS2
z2yn%h%x+4tMLNy%`WR$Jv1yCubZl=-;uS!ia&QHC>fQ#%yf8IF%*n(a9+M)l3{a$P
zMN}IJiKRVku?`>-&E^&W&j1?zwSJRA({WeTQQIp~NRMr2>l9l1FBzSXat5(}zyWrf
z2YBKN1ESl)Uc^0Dvp!f=C*3?XPX$TPk%HPQE~Vt0J|q;_Wc19;G9OHJEm#*Qo$9p@
z$k3pk?NYA1k)lGR%EEyMV*NSLz44>cVyUHm8~zSdU`w94!Q;t<v?FN%RT7miN1v6Z
zjed;yz$CiG$)9)i3!OH>KU@g|SAywa{OP#9_?0j`kmg8pbXd-k&)3f{&mIvqT&5cM
zgKu`c@%Ki-stB)?Nez2@^GLm2c~OTJn#x(>D{Hy4^r2@vrcLx#a|?lOl-9Q`jnc`a
z__t4PhA}(b-=V{343&1q|E2q1vhymGoDFogmYFXKSPL`A04N<6cQjw6PP&aMx)O(z
zwbPWq4{0$fI!T-LG03g^?u}`b>cVM(uh;c3eto~<g<BI3edS<ETPcxQ`u*C3syPdI
zAt^0zQhme+I+Ydyps-$?qls!-XsbOhdi?=$_+kT99j{F;Cf%$bk4WI-WJaGuV_s~i
zx2$(hkXD=2uRk^#Z`<1V2R()0)7$1Ony|v3%(bSoBXAe9B=Bingsbt`rPa<ZIN&c4
zW5g$X>$i0V)q_0?DhWc9S2MNTdIYzV040m+0+fh8H##8PgtRxZ^8)^%SdLTbSjZ-!
zfC>xL6JF}V6eIN%9JtMmPqCclA<)%Q*=~$ZSS-P0Gd5ZP;GZL|Xn9WD^Cv40ZWK|E
zcIDD&)4==X=<7E|xhTi3I4zY?yQ3kIV(o*4Y34lUj$>Q07fPQ7=)eVr>=`y%Td#50
z?9!HB*^uz)GtdNc;2>2z^JL>f#x;wSnckzJdf8DlkWp2eI*g1hZP>`h`p0roR%M-w
zCXkyM3SEdQ*qZ>awGyG~IEx5+iuf4>@`2ZUD|UyW<dnk|eCwdjE`W6jaRk4|%^F0o
zP@Y28=FBszQ;eGkO1>_)KdH@uDu#XJ-C?nQgdG6$vaIY(Y`tnq8wbD@Yq^2&i_x?7
zW(8L;L+Np6o!D<gve|)L6B((Ce<TKOsD@8ADSLX)^u;2~gx{5U{-S%YFroW(5o^Lx
zUu_20mDj1s?`D|UA0vKFW(jpjB<~`AU^#1^s5cOwZGt!<i@_qkidG^;bOtX;AA_>(
zR~vR4g!G<j=5@*H-geO|1FiK%2KU@qHGQK9sU)hwMTqU3fB`wEvq{pIjPWTC;I=z5
zn1IwR3^(W$rAJVLv64M6SPRL3hfVvPKZ6Y#ydyKYqD7j~%fa3(v{=8HZTXq1^$shT
z3=Ev@?iK+oM+NqSy+9){o1Xuj%_D8x{48C)TZ#e;E3P{dEE5bj;wlCd?-!2nz~(D}
z^lSSS$sH($!>VEvbAi$pva|@SEUv_82eV#PsjdkTATK8`6C7hPfv-GIznMqqUow)2
z$%e%53<_;iC_187Dq!kGksRj@f3#3B#Zva6&jHf5MOIrvvTbZ&!Qt+#g(!>y7+JRS
zdIK|n$49=P%FeflZxIvP{bI=x-PT`Sc;Bp_`p9wr1}{-s1~ny1Al=^H)RO~QLumrw
z^dk=Rfx>%h`WbNsV&C^`3Z>(SitV{)Q$7)TFA<L%(mYwb12&_QSz%!?tRT7JeP&<t
zr-)x7eq^1ccFGNvsFb<5f1Q?BLi`l*bHp?ERoZCt4H0};8}bO;*e^?-Cw^kXhdTO^
z)J9n0BfljUrbxcLi})OIBwuSHegyB2SkaYKKrGHJgXn3Xy<^#PE7@kafD`%UhK4sI
zP)K@vMfvp-(MLS8Oy%YarE&noBZeUTUTx5|F&XS_2@9;cwebzg22VD>?7Z2Bp(A+a
z%uB!1;d-Se*^c2kyvUpN5~{CsVma0BvB+TLJPIlnBOWPCnw2AUzQwRb4IqOm_-I_f
z-puX7)@|s}Gu{WV=eK0&0x{o6hl<38g>-u>Qp-^ETNAJ{<#2_4bNq9~c6B`DiE$tY
zyQ#(L(NAV3YMJmp8Kly<zPPH>zeHRjey|Zd8m%&a`t=+M+2GDZk0S-)gCW?k%2tuw
zf_$j7ma2nR81@@`yja4?8XV3xQeSIc*;s|+CV?JyXf8&(WQJq!fKQ^d!>>QEiVb$F
z%r`7t9U^{QXJ931e~0TUDC4|9?Mb{+tQ}|=1^GrECsc7}o>`a!N&6TYWSHul{3YT`
z#NSV4UKHG0GD}B1M{rI$dxhSM(E&NBcxpUK)4!XhVz$T~m1XruT-NAA+~_aRaYwG-
zLD;-O@X9W38BoS*7a%4L!RYy`dz6sX+WW~*lq|#y$baS!doWmwb*G1?qB!(`8|<u=
zC~W;zs#`3<Y}J3l9qmmCE)hWu^q4l!?}2u{8ahT2t`a0N|ApQ9{7&RcIR`;?m%+Nz
zt|auSlC<FVCes?%Ha|%P9G_;gDgxJ7Oc(K;KW{+>of+doMpoJ!KCvy0X?9A8t<r{M
zE!J?b4C_eS&>JG&6Zv1fhv0gK^xPvJ>23EWMuMv|U_n8QScxES!Q%wak|6;alkaEd
zRyv3P-(s~m)I8$`g2?PrHF&GR*IluY8f#A}gfZfocg6MK-Ha!yXe{n*j~5QC<XGtO
zO8%3w)9ZT$fB?95ynEuo4ys5ksbbR@(i44hQ|`Qx$Fw+5-><X?c>=Deb#-FHwTq$?
zHx|DJI%HNdo|E0bj`$_wCkB-cn;^cR02QEw2TuInfb)FYX(#;5KfSPEN8o2|FBj2q
z+AboGKxT<?#k|hVLqE_a*jdB;;ucP(a;;bk#pV)~MZ3;G3(J~zz}McQ`aqD*e&haI
zqRjlazejWtpCdkDR3pRuUe`*>f`Oo=$s|+;s&FLN^(4oD1v`>5>`jzaU{q&Ag&QAr
z8g*1Jo+3JXmGj7->^2zy>93Ap;L$pmN+8YINRZoSrvAT21W9;pD5J3HLRY--ujel<
z0?Txydd*>kW;$ZNhd-~Y@apy%DHjO)N?>%Lrq?C`q<~&IvKeB`<{<XQHeNQ1z{&#8
z+)hQrD6D&TZtxg3DXbnhFp+{wlr&mTKw|}=MGPn-K9Fs>taq@n68YvFhcNFiXe3xZ
z|JQ>B)6ve>?Qgfpp6<BG++Lv-mYu&ODuN6OqhOjrzy<C(G|x7oaXSV`tX@StQt2B|
z1va?Oo&EjA0q#w`doeQj$_w5;5dDfx@AW0(OT-U6&cp%OnH5e5JT?xXglM7zrAlo%
z{Rb;IFKS>)0)>x8W@40Bq{=&K5||OIuZG&*fV}sFzLHwyb|Y72=~IqBX_swl@-zK<
z{bxHl8a;Pq9$7^-bPt7?)<Nf^_R+Ll8t9SZq4dvb^a!g%t68HXeW^K=t);T5j8#>-
zDdL;8EM;uok>6@JYe4e}{~Ym=WVJHoUS$qvS~nqtVZZ~)!fV8z_>&oKotd!sar1>n
zld)(4k(V)7E3-D>w}eHTIA^bj+$yrZ?__O_HDoM4r60hnwc#mMA6T9iPx`i|@n*u}
z3BU6OX?Nt=lz&UhlHzzz-gQ<aofI8&*@Vx5^&d^RD_zbN_IhuLMxO+Lk-b*!MjI`Q
zok`kXSf+&m<S++<vXv>HzCx{SNI+s~lf5~rfiR>7r9R?Y#Fu2G`Up$7<NvQP!KaAt
z5d~9#T+xtP!hyfxUScM7p1AX2^8jf-TXY_31X4j&DOS5?!2}TsqbdYO8V+N=d3`PZ
zo)zlwdzFp1lOFQA4gdjS*&A+oV-V5rc%l>8aM}pgFNe6!+v~Ba<L^{vZwn4r^Fk^L
zx|t<}Izo%w&FyEj8mR(cX?*fv)x;ukpc6UzC+i~WBAz1t%s63XVB>ofi0JkL=1Rjc
zX8z&uUinxonAGrdt*ys31o_##?EAGR>zVrSgvUEjy-|vvf#Eh-s1XWgw6R-y&Wy73
zWevam{LYqhfk#jVbs=lHTfb08B8C~|9<A1wS=*}Xx?U0hJ{R#n0R3tlW^H{Sd+Ou8
zQ%I#R;298Ea+eb#?A9caCDSxh8QHV-zzs%3v_|7!;2;-8ya^6bjQc%WnpawS2kKzO
z<_4%dKcagQVUkhk+5;4h&f$i2!L_WL3WphhCI^a_2?C$6zBd|y=Cg9gMZ~DgvHG}a
zz>~$jd6a@E)y9M%b0YUGQNgsVjZ26s3Ij^Ec7*CnatsT^D*=C%*<7QJh1?s^N<TUO
z+rfEsgzn5d-X+QXWCA&cR9h-^0+7uy;4R|2naS-_M|_C*9WMd;Mml^j@kGB#o+Te8
zp{>2)@;>5-%&23hZvj9QRA-AH)(zs4hYVQ1P(ljCPBi>9B#skl#mRVqe$$7BP2QPs
zgC13L115_l7+L2#!o~c}PX?|#n@F?pftmgMgqO|nq&rINXDoVyKg(b{5AbI@-2FfS
zLS`zOaB=@LN@;kIoxy*VH**|8%E4zGj2+QuvYAgULz&CpFdkv5(q3B75=naq$`5cI
z3;P|cbXM6}(tRl;+>yuh9sp{bq!fOSoW&Ni{R~20oZp2Ev<180;de$$Y6}3xiPtYL
zR2)hQa_TVpZE>)JDV|dVl3Qs~WST~6R1r@RUn%Qt^h1uO){!x`)=XI2P~VLr&`i<&
z5$sSe)zlde*w|6Medhk&CwhPj6j2#nG`}Mix%{>kr2`6y($=3Ve|M4Kum9Pl!5#Ty
zP9$$F>zE6L&xx3=uvA-Do2&v=S}?6PZ(b})30|q#ksx$kU`P+iuVh%oiPi3?fKJ?+
zQj8P!vZ!I~D9v7MAm_mA1$J2U^XBOlF0Clkyg@-^w)pBwF4hi2h(YEd1wm;$Cs<!O
zfI9QW1*LffKD0ApzJP*CMFf-GnDld<;XtJlg|%yJtb)1#16IAXcMzz;mvRXrEYDo+
z2xqfnfjUcTQ~Qw*`?!>y1rPjY%}Fw(MG^gt9;1re?tlw>)}^-s$l@c~ucHsDwFg!U
zYJ;2E#xTK}@2ui8>w2EiUhFyJqkmHgZexf^)svpno{`fY*Lk4qZ?aUXGi-Pu`ain!
zUKD|ABA})x<y2!k0)IoMyl5nutZi)I#Pw*J^%rW~YnB+WyvM<~fYyxhN^$778Zm;X
zci^m!*k*(2pO&JIUq1tc$rHZn(RQNZkWN&fPRaLY1o{aCvNI-gK+>2Ik2YG#ef{Ul
zKG4qbm8oiSnnxNfaz8yE7~1uMdhUVd^oD0_EhlT&K4btRTx>E&X`_iIDE(|R2IPji
zq+(G8Q^c-xejDEp%Lc-rgDM0c=nqI)v<?AZYr`Fd{d#K#A2;uRzF=VpzJ<TLviNh>
zbxdTS8MIf;!}rUvOF6(RL*24aaZCoi1_}t%vW;}IuHa$Q{`G9qv_I2*ec*ukPoyhf
zQZGJJs~f-|`=y#9?L?nUu6)mL?F>UyM4}gf%{KZOIVzJCo_id-SkN|kr*seiPN<Fc
z%O;L9i9^c+Zm`G&#`9{`6PfyaJCD|gx}jv9_+e>&#KpO%k&<+Q!;kP1qQin6Ihm4p
z4#sICt**Qy$EmrRI$+dmQ@Hakb@vqksxjS=j^Rja&u7b@77nB|Vnd>WE5asqxeNYo
zq_BgDsD-yXz^%^kakG<fvghe7TT?%8Wd;64FWF>`s()cEl@7eF3`6VJyPttN9zZ&}
zk;w?$o+sLtQHI5UoYDJ#f>gV=szhao@7L?sNDZ!*ybdHpLFf)B^;Cs5AtBL5w5+*?
z*HlE`?HWwrx^>Vho~SqPqkgZ&5=y?VcGocDt0@IbY_dS0s;hn@xbBVm@1enGI`d}~
zZRuwMF(_zXmqVp8i-!vePa=rH<Z+%;mJxq1Nx8&o3lN!GhmR~rG;n7noqm)53C#Ws
zx>GAkVIW*N-GTyA1-Y`RC!M*WFf#tkJAD@c>>TlP#4nu9+0=Fxw=2d1cbn)ZjN8%$
z3nFF@XXez%!t82#!`^VFHNHfkqs1esn!C98h}(0SNlE)nPZ-PI3&2`JrBAiBOdXdc
z#S}xy_|$Wze7yh+20mOe>ctiY9%jVn2#Fqmv1%c^ee;H+6ecn&))s)cGa7-yoc!Vd
z9PJIVVk8Q(@C#u{iiR|W><D%letW=Oo*`0ZhVbuhzFJIR@4S31#?^4x$eXGc$0%bC
z#3R#cq+GaIN6nFI*1u85?XaZ4+;+BLza=B<=`U3hG|*aAL&T{Lnl^%DM);H%O7NUl
zUiDam51KU#rBgCMcMl|`lWFs+%_5|JLW)^+ew79Y)vK`43H-);(mJ|T8e^>I)t#yL
zpA!meVroK$(+?}<7v^`b#FI4r$gI6+#L*Te2b*{F9ygjMaXoD&M;vfr)Lg}jW)92Y
z9Fr{fh)VMacAMDL0ol=&$q*aHrwcws1b2J1zp9`uC$-Ju?MM(>GMkY%tQOpMTE>|$
zo|;Tm<X2A~rL=yua{1yX1`dy7=h+#16HE@;TDX8F;$y@oIt|opaKxpn?j@PkhWD+B
z7$bIaps!4TeA|9>e&37%5sC6>Qwef9D>iC?9U*cgj5g-31zM-2Uhvl&3mlddFb4v_
z)i8_%lD~e^8bHf>T<{x1#1o>P2gb}Sq_i3IPMFO3L_%?*AgAW)jUv#*0IZIzK{Qy#
zVLURL<ZiPQxWx5I_c`-aGwC~LZY8ya(U$GDvppNQw_x%b@xdeju`Dhx&Ol%pO1N4Z
z>Ks<z+&F!Mos9XnDp2FVpN|}d{o00AL@rr^mSrEIv>(X8nzb7*;hzd<P=%C<F0$iq
zcn@W+mA&2S4cAu#2)`$jytMdx(wrAm#y>!+O$6W_HI!kEK)=J%4sgAby`cN&h&P1d
zhfNN;G|!;wL(ghu>_~|NTu>mAYV1&xU(kH~eW0rE%M4=SLj=Dc%}cg8-HS!F(aLD)
zBS<Uwf<KcmI_HLSn|HkUsQu$;ZdfY5Gf=c5#Tz_;-9`M3h`&jVQ<|$cGT5M=SG||S
z^p@k)rs(QzijC}gu>>lsJVxA8Wf<Vo)d<-ebGHzIWaeTE4#|AfAFMU&Y@~R-2C(yt
zlE034i1=s$Y{X$*d4y(j{Yw-=nLSj84xupy;CaoUh%?(S$sZ|LB<xK!(2D@zQF}BM
zLvHkZYi$a*tM|mNSKR2y_OM=3Bj>dYa3p++pDMAffWO$;A8r%SdlI(qB-s_c7^RJQ
z9ZXttyZ#mZi^7ehMLADC8iJqJdl2l5hW8w~=IjzS0>3^5{9AH9q#jgzZfvH2vkiWw
z;q4VeFL6oVsP9kfC6s<4<3_^f%CzQE?|%dSx8U$QJ6}BunGvBkM9{m4A5h8W5kKOI
z9{AnThKhePI_uZ(@A2$`f4GNED%iQbkwU}z=0`xNc1UWmlJnf)8`=cMj3~tJQH{?{
z*B>K}3`vv>a{~i)W-}^)sC%~t0PHyn8wk{eTtX5r1b{seTJdBdLmU}o^NBj_7r@{>
z#i)@@mBM$if1e{hkhq?B{)rg$%8k_P=%-%^OikgcVWQY*e_^9~HL`d$-A>&yU%$ur
zYaq$uAc|@mfkAI*AA$mmzEH0-u+h{u4NusDm3P?B)fOdB0Guko($mhI<#*n)y=Uj^
zwTyT~dvmmjK;kYc%2Q8k=e>B640jbgLk?t|U`6Fv7vCI=Ji1!n@Ehmx%9xbhdZKWK
z?(-2QUDBzXD~~9#w%JeRX0z+6aD6))GhMQJ4>o3I@d&M*43WV**y|GvqX@2r0g2TT
zopoI5z+Ww=+IWfrdhC@fuClC2I#rcO)Ag$eqbiW?*n)=ggxHz1i)_SW<WN15yS6Ot
zWR|?W3mx6dxeXvoc-s;?Es}??x&J$&?_{NX^?@j=Ef#+s5b+rYv?ow0=l_Ag+O3Dl
z>kM_^%{gdx0IG^}@h<r#2~lQ9JGWw3Z!Z=ca4)x8sF!^Jo-#?AGy%wW?_r>Vqx!%>
zoVWVLTF|o1KAu@H^Gn2ETH@CMS|+PWtD`J^{?^pjS3rx(GoxkM<L1pP>v%+k7g*jK
zhS(Y}oKhgF()85S-$()*8*O#9xwi`w65-M<n-H$ppZY>KL>xJe<pX-m`79rdc~q9}
zT+Z#`-E0aBNo9;z%X-QrgIDg~@JfTsI)(>}I{_`TqJUJUlpd<!00p_o045sN!<0&6
z%mja0?seuO?zlbaU~k{AA>iex&lcIGy0708p{uSlnv2OuI4>N)cu&6){5{oMrH2)x
zF>$1%!(RB{%-SDH3NII<fI8;XUp5~>Y4gC=ORJ}Df-9HZpdsaP4iTToLxdn3fZKKb
zUi1kO^_e3Y+>3Ib7mQ-se52W9;WCf-hF(AdTq*HpQs=J|B5Abu1yzI|#SIlSy~`n`
z0CVp4jzFbSbh*SG+eJK)tCa!53pb)Qo;cErpC}+kX1)4g!Js7u9hQ{t;^s>m-ClAE
zy%<ok5Yu~MNd4`5g6RwI^?NbY|AAn#*negHEyJTfQ4p?JwiZ}hIe((>pD~iz#XRs^
z9qw^7;JsZ$4YKCF)lg&z&kP57Sf{@k(QifqYFcm1*A9gKC$M^2h7!B+*<M97@Jfyk
ziGC(Ao{UW6z}YPEmRBsIFuhl}1D_0^^kg$v5{ex*KiXK7cWZcJTIP#fz^yyztOb9)
zkb#yUoz`^788@va(|&#R+-8Af$jGEuZ#k7YWa4%q|G-L)R)JJyDl<Q`9Rqp}kdjQP
zfYd^w9?pdU6%1)Tknpx-^wJf{Lgkp!GjQepXW7Mf>i-48&fbhHeS{2my~D-J=}Y-C
zkVj<+2pKIae2n-E=MymAoee|pk^?9X{f)wBMwxX!Yu_TiM%>{#-!=n4YuWA@)%Jtc
z1P62Gk$iukcHfyyx@ONap0QsF85RiWjj08yx@Sw}%K*yWKENtMmP@=C@Hczx_rzu`
zOlagyC9W2lji)lBym0K&{hMoLjWH}8<}5`>tkb{5hm5SSB&A1-;SC(=wD!T3Rs2f0
zByVYb;TFn-%ghMwjfE~{{UwEsH!G;82s-2Yqa_Fv!t9!0S`$OZ#CuCYQQD-~!4@TB
z@w)XZT?3!@#I&9YH=e=hTzU9wqJh=@=j`)fZ-tfHQ8s&8N1x1mo21tj4TlFx3M0Pk
zC3z{G);~85_6`$xC5hjejo;SqCIc-AP%45>CZ_abcwPGt!H>zFr?ARe*ocQWdwb@u
z7k4vu{X=ibf5BnAu490TIA?Qu`q4e8Wr7$^RWLK~|IA*W2xgU~rgIx#zUjbZ!xJWw
zEfu48XrwOdVRYKVhKSkYgD#{@4Kd(qtsS>dv7AOZK$XlFd1cw-Jr->Opee`7fQ<9s
z{((|>w<#gY%(KpG*#ZkE>m@5#n#tXmR0mIZzrx~%hkhaWR0T2EoZ+7J8#%NSi&t|0
z8@+6bK?fH6k>CAp<$ss+EF8#WA}uMO8f$E=tw(Q4B9X+Qt(~F)AzfqOE_=`ZOMC6s
z6$SrK2r8cVYm;@GF21i0qk>gDGr{Cw^D)#^w*cYt$<?xNeGjIUN@D~2yZz4>QOEIx
z-kCj7t+j!1H~Z;$kMN1(si`j_F&djjlC5J<P)3|-*OuSiFDDV4-y3J$5H!C-C49ET
zH^V)bn=*Rk$dzgLYbdxH5ldKwF+INnAyOrZfsq2QJVLfke{U4fHR4OepRs|xS;_2@
zo>`AevH_Wag24(RNxe&~dXXRyWoXlK7+VjQBqHR<fRfXu8<T#c=ZBOz7gi46F5)P}
zT!5dd%=LJT1Aezy8<6sqo+NJqa@OQT&6+$~9|KjoC(NL<5i+wI_?AeqMY>nO1ho$K
zCTZ)8Qobc&8D*vezDA{w*&lxdR=Wf#{mfO8FA)V!r$x+rhhc|}5=e_+8B>TAaTO?9
z%bA7A77l1ci?X*-LkfC5MYO>rB<W|QSovVI(PRaXfy|~^KfiXaA=^K2;&cS&&HIWX
zDWza~0}YCArKu;Sg1IEoKdc6p(W+OO<-DND_{0J&5Oih$=o9`(|M|kUi^~;F8-D+1
zo5Xk};_q2hDfha~8o<QCQ2Ep&dA*eVToNUzLTO74sU!eBawsznvMfEdzu+ZSvgO%+
zKSBOnO-#Ij{-hL8H-?aQhZj&!M&$s&qjzOGy@(jVBQFs@!>c|lRTjTq^90k;8p_H@
z`9x$53Hm5y{c4iAvm(yO-FM8!@|@uIDXq?2nf!Xd>GV+F#af3OobuLO<bYry!(Cjt
z#>;vSCex`nN%X<qsW8$@w!xnDAc&6~qY*bQ5prv!*x~@_08Yl$JyD!2tkL_9R)l)<
z7Opm}c7kqIRc7M!y^#Vgt{+y-ow)fU!R#Sok7G<JxSTvwI<FIfdO%TA@Urz{oxbQ-
zBI1=&evS99Y`RLAMJ=9x2K65!{sPE&F(uq9p?h!Do<$g~XnHP5f7+#fN~FMO6yD-f
z+Y9tmVo^8Dk?(tR$>QNho3iKOO2q`pjLz!}I!D5j0Uw1LWv#j4!&uvYaPbx+6YjZU
z#VYKjDy9+&^l0c<`9{l^x7)x40Bd7woRKXJQ2U65+eZ{wyX5Q&ywF$l1^4jmseHwg
zXthkJ?lrIwM<eM|7j{qXswNm!ql5c;UCWr=9t*U}BaaZ)SE>0rk=U<9Gy~~RMldW{
zp!u3VSc^fr1@o<*=!3W;3GD1gFUiW^C?I*}lHG63^^`OL%qJmLaw!W_OX}XT@EqVi
zM%-@-3EGHn5r2-D7(A$YWy?0GiMiNB5QF7r7m}wSbG%xb)vgCZP8dy3lo3~!SwTv9
z>2)&;n=qpW()48BN39)IvUswA3*62b0`#N3M@4$f%qB+s`sr-~L2!)ZeL7|o1<o`D
zf%*xV``)CFJI;e@7#k^Y%}demjlWSR>4a}i3ow3wbBy~1h*80)6g=?Nc4u*M$(Ak<
zfjI|bj|pz`4?+4i>(Ab7D3WlC3TM4dg<Blp5yW#w#nu_7*YR$suNKMuKQIL0g#8aT
z9<0E4Prp&b9<6$i*>cnof591j!AX^CHy&0AoNPx_zAbpP4H3Ia%LCce_lR#1Ie}IE
zIbCdhI!?Psi5l_YEN}kI9xY(pWSuf$y7l=is3WzvR@qd>K9ZM4#B;<K;_<!Bw8)mo
zJjFo%6T-c6y00ujVRKJ!*Y6P--dPXwjsP~>-yb;nPsuna@pn5C<Q^_A$GXD$4;Rqo
z769%dK7;xiP>X_PVdk+RWElC055}-e*4I&SwmNI1x_$Y&BtVs9o+z1?Jnrz`Ja-Y;
z0iCe7@ZaN=uduC)1uW=v&_P!@T_L;RiJlYw#ebzTwZ=f^ogvF_I9nSBIHyq2F_v&o
zW+EbF0;T=9ZfD8?VLj_7@*prhF|4T!wB>E<0h4%#$5$n!phB|2e+78yCu?4tO>aha
zwaMr<yuY5jwY0f6ZRBWkRfhE~cwMW5rx{>NR76kyX800>8Rm@i!rp78h3wY|cEaiT
zg3A+V^9Fusmr!tFHLQv<j(DMy4QH8H=?wtXMm#giO0J_bvm5BZIjs9gMqokDLgWmX
zl5f4TQ_W^0<VbKnaMb?<Dbz)LBn%uP{(7?xFCo>{ls~zxd9acF=J)5NdS>pKinx>U
z6{DpB3l}1Kx@FW?Z6%#xcfwvtnr85l$nR)JlawVV`VVCJ8~$rDiRzUvd+zqp`moY6
zgv>j$ds8u^f_YL4<2yxQ&A(CpUD!5s#@25{&>s10Wi<c^_)6vjio;EDd<|ifnW#lL
zrZ?tdyBSp_=Q}d;i)Gsb_thGvTpU2%Z3&k!n%=gsp8S4I0Zn#{^{*A(t5?oeV}+6Q
zlbHhh4*k(HV=aZx8lLfXfHUTSKM*uZR`F_6GKGx9(JY?7Ce3=~#W^EMU}i=qG@BlP
z`H^-uGe~U93xml%kzb`sQrA9bcyCpIr02YgxFbJUT)c<0tvP@>h1)swer7K>Mp5_=
z6*dSfx4Fgxv|Eh>>3s2Ws1B!bodrEnC6r1aO}}0uzVYgTAXDq9AWxQ=51V&>iTHEG
zUqku^M<C;|d%VKN;ZdRDLit!+wHmA>A)7Xf@E4nxQ1i4Ox%1CPa20C+Bo_9YRp}bw
ze@|3766zLpa8*1+fk}l_K}GcDgJaEtj;6l+z?nIM81=Us{(pl-z96KVs|<6w<ZR!(
zTKbW@{}6huDT7Uhij`JKp4_g>C<A;+0}|Js4Lo2%LH_!=6@itW6=eQ7sqTU67R#`<
zFyM>nxWjtTMjI!b1ESOXGlyQvKn)q5tg*h`0yoeAo<L#0Y=VFi*O@~=XF!GtOy5`$
zaj}8Y;nyY?)Ln$N)ZVR&kg<e$0dr8Xv^il+GR%z>vL#+Wz`IY7SB+WHWStSW_4AmP
z!Pvfn_j-d8J3{@U3{cxN!l2CF86BtofPQ`dUc<QXCnCifv<Qy`)>9GyR7A@R{t8g*
zJc_zauRnrYcGkF&8Qw2hxzglBsSgssR`>1I>bNbb?AK%<qzb&V15~%2H2pQq<Mf`2
zLua_>Sx&|?UM-K_mO(2Zp8y&kIiV_Do!R4f4}gr2r>ejYoYp;oH8ZAiq2e0JIvaSd
zlHaY3k?G@|k2)}VTt&s7<mgalO#7C+4<kTE_gGlY`aK$xJ}2jISk$73I@&M;xrYiz
zJDR{o-2|!q4;HbNI7}vuYJst_*^C|>?+O<!JX#v<m*b!o{%fEQ<BEE_#-ik!)K0jl
zV=q)dEdWuyICDX8Wd4N@hsY8HnOl-wST$GAn*W9jOtp`k8p{Vf?j9T>3BrboC$Pc7
z+{n>7DFwwE>+M`PaFM(7_fK}P=XE<+p)BP<=U?h5wU^kZG@Oz|w(lW0$sTql;W*&5
zN=u9SL<g(_%-mz~LaLpE%nn?p<Ag5`@W>E}mOS$*;ukWQ0X$G(PnR`<&B!w|JdQBE
zyYyH~NMwdmLDc4h2ZsOLk)OQTt8Z=>FEQBBM!G3t*1%5{7q?YdIx!W~5Uz}vkPK2P
zsv<#<1Fk!$WOSxzsx0}JV@sD{Z$+De#*4{}x~02oLj|;t_%Y%qdLEU>oJ>Jq;{!Jw
zSV!sP9;fNa(tN=?3^^4c^vt{Xv;=1-^Q03eZzBVlclh@aKX9v3v6Z%Fk|=CYxJQ$G
z^j35^k^Yx9Sa5?vyh7AB*zagyTN1W<vw)J*HFS;m9#O&~)%cvzAW>#y_ig_&47MlV
z)%Y95volO%Dk4!I#$YgLU>-qZ8d|Nz&Fp@?f!gFL1-hXECcY-GD9DR8Xsp1?W~@Y*
zJhDTjn=Rq$uU{7bq5g*gPmo!A+ab|F$$$qqpB+Zi8CcBlXT@d~Jy~hBLzAvp)0^^0
zM9{*Gj2w40X*G<sEa*mmHS%OzMz%8@a?Y<zmNR#}IDFxNm4CjVg;Dx)WWj2aiggJE
z<N2#PHNJxSuEg^RWU{e-nH3#W<8}nD2GHJS<}VIlZ%$jxqeO$+Z55Xn_E#D+U4XX>
ztVq1(K%5*lGk}g^9b?3oh<}H9e2q00=Tyt~PQg`}U15+v=g@{}B1YC?v44^e><CfL
zKZUvOmx!;Mb@(}A{16fJ@8xz)DJh5K;=v)gu>Wc$=`A5COe&qi*NL#uTHou5xqppO
zICt*_U{4d|Vh8fgC+OIW@b=N)hY?G)%)FdC4F0>l6If=SqSpdzo2=DN74+PogDTT$
z1${RAr-(B{k>r(wIWU=3Y;QhXmwID-#S;#EXIP;FRc>V9VL5J?Z*|p73l($kL3&XG
zD7njqyR4{s_EzFKv#Kct5o}t;(5~F@o-%3`ac|!*EIYX4pecDDF@PFBy|Doqp6tY4
zzYy#?nvrY7K4~LKZ_V1BQH%#`;FalaxQ=tgmx!n3>!d3}Qg0)@FYDJ~g$00yELi;#
zodJR)eMza7qQ|%!2>de!^Z<u^0Y{`XW*%{j_`J+;B|trLB<2PBNTGkD4moE)6)O_J
zs@@2m!CHN9>wo1nj-SZmgdwh|X9qi&=^2)-kMORe)#O|DJP_k;-2XrZ_=uyKiHtk&
z(3N?DSu%|^V{fFC2xD9weg+z!E@^~R^e<@@EAWb)4HOwT2pjg3)4Vo+-psKU)(L!g
z?|mAX6;sZ($RY(N0Ofw&3A~H=G2#yq6~&m-a)Rg?i+DHvnwH8Rm9eDUY0VMbuOFee
zO}>(KUO3Ky{paK=i#~!H@tV+eXI&q)`QY<r20wo)nr$u-KUoj(L=%ca%04kc(SFYm
zv1j9QayEEerlRTMDPIc8uz@?N@aSd2|K4(eIg9#aIPPFxb)bAx*ie<50cW&|ePf7Y
zX}ZhKeA9OjO!xsRZT`o>#tAp3LXwDiSVutDz`f8*qvk2KeVl2<eobUpEza>3?=R-y
zK=hFK<lauj?fWkgFZQyKnIhf;j#S(kHfJ&Ghz;!CM03XfpRoUGm)%H`c0q?swZs-A
ztE&4uGw1*Rqq&)j`9ycI$Ssyg0XY}oYiSX{s*?|&YWC*N1Q6ljZf<Vw_!bekmqji3
zzK%qVrEdipX>CM|NYVpUoqI1+L6gt}JO4y3y~A7v-n2HZhq38blr<KN1FD!rc49nv
z4HQfe&<$c^B#CS6#2s+}%VdC;O(o%Ip<6C$HqmCVE+1x)QpcAZy+?b|%R7#7x7mqX
zaTh@(>cIP1e)5#T*C5A|M&N9HeZsUIeUW(qCrq@t0LPy|rza~KsTo^`D!Km>UvXic
z;E769i(R-lKM4Re?{k!9OR?rR)C}(|$2NsKo7zeV?#NRpkDMXZ68oP>w$4b(WnoAB
zN=sV)UjU2A*Z&BGB&p4n$5ADyTMv=&pLmS;yS=idLRh}%<X$%C$OQiHXj<Ee?T`{(
zUB&O4U8r)81xWroE}*lQ`dzXzQoUXSRlZx+?O+U5hXYA$csaxWV&5Ay&64A&e7OjE
zukOQ@LRS{b#C3`P5CMMuXb(yr_X?#PJVIeCKxvc5?lxSIhp2C_#Re5|u9$v~;P$4%
zX&5-t<7g%WROR~gY6&~uVA_xf{^Wgxfn#iOriluuC1_r$#_b4Qp3|M^QLD2~;F8}{
zKc-ShaTW_Kd)sWpixh#V>b8`QUd=IXw{lE`p(CDL6%W;;Mk@g1Dx$Eptu5=kA#ZK$
zZ3o*FTuHuHS{K}f|8%2#uGXep1R-S6811Rrui76PL-6+{7>yhINyAc$wTlb6LTzqS
zSRF1V%ziKI%L8a~SW{?X07SKF71ao~=PdVGE5dBhcu|52@$Gq#9qPP(T|jXkHju#|
zP13C%%aSM0SxCi)8fza&_Ktw59jat`nq>c_j!@lYZ{r_!9LjACbntsiC1OH_Ci_vB
zgm)w+x6nj<iujdPP>!Wr-Eqb@0!Yc<>j?cF%c1jA*yi6Cul5I!g1Nv%XQSa8j5JSy
zr2?2uyr9CDxXl2Zea;YcSa7*n7t51D;lYmXw0VGQmT0D4JXtT=#2p1OAo=nRZPCeO
zSe*%$b})1Yz`?No{@&=q0{H1T%$Iq?fO-avEe8CJS<X7Qy7N4R9B8>e^W@^t!!%n`
zk!*0d?+}>c86=hJtPJoH@lU1;<u`vL7)~UdOXyO#bQUN1erW>pEA(Pva<m?ZRU_=2
zG6$yQy)?1@%<KcXhiaXE=_H)MEiZ6B9eG@7Jqg`fK~P!-&={H>&3mf@q|b_&D&oJO
zrF+C?UdgF$Ow8>zDNyZVXA_|$>S;)5CQ|vlh+2~GKC(g`PRB(YEhd;dAQj+_fK7d;
z0FpddJ#OB-`5OX*(lU)3w{U<+xveM4&vptM+NpY6x^$N}0Qp09enju88eEWg{(wtq
z2q3TP8jKF_0EX!}%hDwDCavl+4BO)>AHc8$uDeJ_NsN}O-ai>HvH+-0)^c(~5}xyi
zi3;dP#7}!|j!S#4*u%Wp%0J<j8bHj>N|6OlYUJg2r_2Y-dOK2W$w)g6lXL;wLAXkR
zOV2k)qqKJ{Pcn_6fUp{Epr!hHc4R}6841cK2VgR}UGUg~LRz!S`h_T3f-uigaLs+%
z=Akf<r(Zv)Al1|h7a`N5shn>X{%3bU*N8tO{@Y#`S(*N!gP_Q<U-nu4gSr<R0#gBP
z)?rbH%nY4ESuXy<pWp0dqKi=BN7e@sh7xAOI}LtYfwfJmQu(J~Enf{2A3zn2dG#`j
z0R@I7Q@gPur9qMlf2~|Q&;h+b7!h%{xiSsl^7AH#Kk&mftEKS+rRfuL1Q-V38pa(8
z!|d#}I<*;6ldb>qkZ-2Jow>5qHPX0DZhUTn->Dm15S~0!dZ|qvcUj9&JA2`L0|{}%
z76+q)8_0qC{R@)}D=ppxJF2YjJ>rk`EHc%>-_4N-H$}ux`h1Uo<XlEOqzx*{SFhID
zP*|d!`=8D#<x2tdnHW~0ooHbCS1O>TYWI7@KO_2xM|=IR+Sso6hFnG%);IW%hybs&
z0I!1NX$LINdnrZ?<qD2{hzRRtD6Q9EyQhCLAABW+6Ai9r$dUvr$$dsDFb}rA6(?z6
z(;9OFi39F&ZUI=8-0=sF`8U&+ZP$@Tdv9JH@!2{QZ|s)9W#I-a;kew`d&KWnjh+}N
z_zos)vIE?3+YhE8J==7bcjDPb5w!qjK9lUtz|S6AC^?~%Rr&5;-l0F*Gec}~njJM5
z2^^a20L+lHJ-$jONpZi#?7EL)SVw(fZ<7QAE0CE~YYM|_KsO^2OB`blB&yQlT(W*a
zHNGT<U5wEaXVh+T_$m>++X@Lp_mn>+Dyp*>G~4nPJj=61OhKQwh(Fg%`0`8Fh(EwB
zH3Hg23s)MQyHUZH3%GJNTfFyT1y9zzcpyK@Vc>+39$12oS0ss4I#EL!KJwvLF1~zD
zHpI7q)Y=fCC<&gdeW3u*Ux=ss^&90xa$QKmcMPDLjZLYon3rcT1xvLv-E3*NWiWz0
z`P|*0wy5KUsG?9x8Xy-L;eSW`bNyPA#j<udM=3E6ob4-VqL_vxg{wH_zen66{zm@R
zaNiPvck9QPP4~H=#vM;A(zN0-r*$W?)T;-}uq!UUsN{RzmiLpSGI5}_!4&CtT6~Kv
zWL}5w<+Icc{ulEWP817wD1R3C*s#($d{&>`WPJ98qbp^8PiIGBz59r-5&ytfOLX|d
zx+XfP-ov_x8FmX-0@Vvg?_flt)Xcerr)F5@1LebtTEk=zt+jmjzWFzs-5@TZu+c^h
z&$ZnHJf)L!l{KBQC9Pk@^%4Ly!B(AYTJnfLQQ|XL4{`GR4cJYEuI2Fkim3(apbi_)
zKT>&0!lrJV!ED?SaAE?1yXhf04yKlC?7lWAB$c_t8i#tT2hG%eubbXCNY*})-OYrX
zi<!fK56)FUwatFHz%)!Yym<!w2`wf2HxikDk9d#xl|!XeHlSnoAH7z3m%cK4#3Nil
zbqC`p>kPn+IL5Es@Chz8tSw?DwjS01t}w@N!fo8FHlP&e&0T2H2k>NrNILX01$M8@
zG1oEc&N@8)fzvaAgU>WjE`IJMbK^A<q(ywvMf}Y~pQQ~j`;dQ7rEM{z8Dn?_{_H7B
z=1mB?g$HYfEm)fys?;T;`tOMUgS6#-dL(#KzN2o7g5ua{{&<7H?bw+eL~mGNx!Nd#
zqdB&_^?jn>{*bEw4xFcodH#ReONsuv!x97DaUPbZc!kwG5V!X0Vfg+O%bq_Vq}dts
zqMnL!9X$jtvMBumZFS}h9yTRFB5Wp;`z<%QtofsHY_aw}1xxs7f`G|p>rKGY$~3o0
z27HM4N`0#&5>D1Rxidhx4I&GJ(qqK4%|1|yF>OvFENoK79<yyA%^L$CE%tJ^LQ$Zz
z&9EyV+Qm%=C9x;-(0_d@l*A2=rZhE44fLyNWZP&WT8`}}Nc$k%?clt;fE4Q*Ol#;<
zv39Vr5{)Iw#byk>p-?<nCwg!5;0EjU$k)HLVIz8S@tP7?mE<8w0Gs$rd()nDJX&GJ
z{lRKeNeyfVeQ~kw|6ti_DrxVTlJy<{lsQU@E&?~u2sPZ^|6x_`N58<<M=;Xa0{6g4
zKG|r}d;ue(jra#*P@SX_;4(t6Y&cK}cMnEqh%~#|mB!8dFGwLZcBX&@SuFMu_Y&({
zC{tgdhVq3E?J*4Tapn3Eiin4Z|C_U>wu*rt6$?;jrEnGT8tii>(5UvOp70uiC|5zr
zKQsmzO6tRBa9C@hHvdSnKJxP??!2*O+R$M3Bo3dg+|%+>_#2xtZ^Fld3TA`H>Z$3=
zOKHu<bp=`n=s99XrlpegWGVcENvZQ1hIB@~sV$_)klF4wWxxu6YFv=ztC-Z0O809C
zlvv2c<17`ACqpHIe>c<B-9MA1X3}8$u`kho9ccy0%McY__GmTTcdOmLSo%|~&woYG
z?9obhK>0_!h-~b(W)K2=7uNY#F<)k|rOg}J3U2$E5<^Yg91V3JEkfq&ClLR_J<lda
z7y4V8Gj7~t;y)xq9Wl~(Y-vWkv_t3qbA5irSI9!V<zxr8o3kUBP(Vr#<xG`GD}UKB
zl(ku};Q<9|X$>nK+x38-I<ITdYJ?c=E$&LVFZ`W(Jq2gI4OgU-dy|&DMubT^a|0?Z
zKrShi0>EhO<-$Kbu8Y`_BBILhrcg^~=Xt~XFD%U8BTguK-T^*mKwxRq!iEx7#Q{$<
zWVEV*6l*ss4zV&axdc;Pa4F5^EM)gV7Z&nI#LZrc6bvapaC+u!{Tr*djpQiDh_9fQ
zk~wLAW_5H&FRhKwka>4J|Fp(V{;AA+by$vcA+ya0Dl+e{F231b`gpJrCkMb?MM-GR
zt}K1%nT8T+_f@++TCSPLP?bRnGB{zC9xm!iRPHxdVK(hSWwdFSa;e{p{_;B`equ0x
z_)El-6_AeW02mg7cDU;UrDT<#ODkqwvj8BZI~txW?fuZTQSIHD1uczeKilAmAT}+T
z_0M$|Y?wVvcE}eJ<cY%Xge`7dT#bMYH=?S}$nPGZQvRSdH$QxDHjw3J)2T|9ps;?!
zGyu$2Y?W+5jCF%ObyiczJL!&$${8Tj55^ca5KONhjZ+q5TH1S+Dk|E`)eisuouUW+
z-@U<{ftvjyNo+Q`Bo*PFa983(N0xpfc{^Hf;9s8sr|HGIf;?!s)D6SlZso2lrq&#~
zJ=vmk8I?&jlN7u)<y8Y93>#m%=lMTuB--Jhe%M5r6Z1)T7~nRAR+ytZr1m*kHE+jl
z&e=D2EK-;zy=N4_8Hx3{sRM9yUsxBISZT>on$74HrZwK#t87|+RQm8Y24!5=>)+y{
zCX2y6uIEUREAN6T=s7zvHB!AdXU<Z~e1X~8<dUD*!MO?YzamNrlV{&7El!H?4>SVK
zX1_IZtP)rrHj5`YypmW}9Qzj%!U2M3Z;G7l8o;9+l|5RXcWzUD>9*d*0@qvPy`K=F
z)U5d#O-5u%_&|Idfg370ch&?@kjiY2;yBnG+A0NOBfz*e7@O?B!eV6uWlrQl2hfjt
z(USAG5WD{~;+KfOv!fm5_a|V06byF&k{F{(rX?dEMvE7<6?ca<2UXxAD?3Rtj0tBl
zk;sgGS&5}blRc>2aK!ICTB4C3fCdVx#^~O$#8<-8WHq^5Mm5sER%2Y9jYVio(e%TH
zbKaNj-6o%m5nrqcU57~(u<eg`Lv)r=>joF4(vpk!{_qtE>&@QF{*K=}GZwMg6tbwm
zT62K)6g8b^1si@_Q#ksx;2q(Z3QI0iyUP*qvKjBZlJt@9j}d=F{0j%znuID*0jUq-
zf&-L4Z%M)b74dt-fyK)yl%;|6Tf}cB0qOB14d+hX;u|cI${hu_pbFxRC73>Xe$mAN
zdMYh<E2$ogW{AnfcR28Yc<O};H3IePo1ZO!-uV8}h>+5nG7Iq7J%bkfSBtK9Ah-tl
z0Xp%v34lC?ij@3rjc5nXe_jMGSi+hww^(au&3heJPK0bTE@&deRn~<!aln=XcjA2j
zxr}|{2z1B^n{-oEsC|j}b3M(pWJz28eX!maDeQ%<lk<CH7T6d5M2NBA`jU?%3X0m-
zaE86!0rM9e^^@%AzzQcE=!}Ho)kv$(nE0I$45Pi{`TkM<+8XJ<{QQmxHeue|S*YrO
zH4lA*c6i2P`^1490gssV9t73dkn=sZvBw`KGA?0Xkryoo<pk<nJn5e>U6EaXAo(xN
zrK{@L(Zl;cjjc^oD4oq279%I?>!A^6l7M=B3(^mr><mlIT$nCwgN0M=R1UDQ=8P~*
z-)$;kM-)aU9`BiK@b4IX3#+@&4nR-!M*Z>u^KoJX)IQ=D{%~7JwpigO$Y3QDHNfo(
zfB8A9fvIgw#iSa)TY+a<bB`N~R{^nhc%O?66YHs=DX?t(L{08B;7lh>BY2{MBb3`u
z&wuC;V$If+cVs(T<Fb+>@)6016n;nYW#!ie{LSs(tp5Qb(xY@?jULT900_Dk9GJq0
zi+7_&o=w|)|7{T;2WEQSn0Zz+WBQCf;b~m}3BZ9`C>6{q1Y)0z6UbFa3xNC+9fJIi
zj7Nz>VP`Q}!ehk$i1-ckrmV9!UD0GA{jbm64TxNuY?$~+kI*C9sEal0BnVQ$vaoqh
zC4Wy65Ow6-1ArjI&Hlq|Kes2B4nXjR0R`vl26k?V{SAK}_Oj*#1eJywN6t`zClS8r
zjYHnxa7$40110O(3J9fD2{&YLi`Mg>5hZ1|lNoBsetwSVdAoO+S-;FU$Q?T7Fe)Us
zB>M@D@5w6)4!Rh@0xQssY!orW^0<c0m-m!QEEu$U-E;t@X)tfx%NLACw8deagh(@S
zV8dE&yXOIyD&OnoB)H`ZpLubNq>C5Q`5IMTONA&wkm`iJ8S0mvm)OWiZLMAhU|n;9
zi2D}t9Px#{IMUS|AJs4<1vpy_mUSJBL%(Hts3e}sI@Hc}g)@2LfK}^O1nF!>UOKts
zFL@hlw@IEXpMmu|n7iw(T&N(W)0xx|ucT(GI~K-VHQf0T<4-J4s+9*0Ob>(c!jQr+
zj%Y)N(0~$8*7O`4p_ZnsvrXMfZed9r{>3=wow3LrJNFZgJnsWpv{}EQV$;df|L%=r
zP59fp3aB>H?S`YeT7p<&=X17)1#Ur!)xf#i8;%Y9<g^CyBh{%-Oh8n<^M=s}tGXd0
z)UTX5>ovsY4DbHVP7G@yg>;`YX~M-=sKSEAV5Q{z;_oJAEb$m#%ozVo9j!D4enE(J
zA3EAw%M%;+05GQ-?PUueASMe_Atlx~a?lGaBF$D<=z*Ub{DXdFNkp^8gNkQXbHMM6
z#!e{2HsVXfuUMWYVC?v2u;%4D013pEM%z7@wao@mw-H>0i|x}>GQr!|IJSEI@N%G)
zIr_?=ON}i38!yYNH&<~WQ)}&ow2|sG<0A@AMXSju1gYz48^<+e#y=Y2d1L*Lu<|v5
zIV{5Ey>J6%M%{2lJSc1z=gf6${gkhtu)lYT;8RMfy{`d;)N&V|^F88)OG&rduS17l
zUnfUw_Y7j$vlnk@*z=F{lMPy;cU<=an^!;~J+3>c4`{1;LyQu#^+u{4KN_brku}WZ
zcxpTd%wfpp*xZe|LW)1}W8!J{04l-v@}s9;p7Nc|{9>J`J(%DP2XMEZ$tr~d7hkF#
z;ze4jWRIH%SYf5)K9Beyi5Y|T+Mih=f3`A0Fx|PKo$AD@I*pqzL4|+2*<7Ly!mKs*
z;qVa%s!>b48tUEIK*#YTAEb<AX(y3j>q6o4fiPHD47yWTJ4*_5zQsJrdyIgqjSR+N
zfi#2_Ug@3xx22$b4<iNMcUkJ2;nj)-m-c()r<3(B0D--A^VfC@v!S$y%_Lxi;h72A
zjg`zxdq4JU|9gL6?W$t%Bme%6<z7j2uNeZ*K800m79#NhF`&xN8~1;+NwI}>J{25+
z2A9@l(N}A;Cz4BYu;;N0Dpc3Z{zU=5(BL2zxwH2q6gFRXBl-z$x|n(7K}ORBbu9V4
z1|ec?In|H6$znAAM3Ah?Y8XPl>Yz}o;etwDITs(Q_;e#moI4T{Rs9zs+4CAeNx~$d
z=i&ltrsj<1{r85!a}mPC2z+g1$x(mhar*5>urM57V8AusXsJMqbc&VP0T9(QV~jsd
z_^iI7ktlu5HegZU|L=%n#ILwcUAZ<8HgCqhNNI5K@mIpY8Ex0Rb|7;~{xm>l@#zKQ
zID%KFT(bid+}N{?0$A312a8dGat@mr%xp5Ys}*1tD04{1nKYRhmF(6I>@aZ9wJC+Y
zrYzy$OA0MYOna`Qx<?B{Y(E-CwvarZV2^`m(LQ(F`QthZmjtQ<<IZMFN^t~k5*cg>
zEKsH(9NSYdXgDGBW+TtcD2VUS-B%0O7wcu4VIl8ahL-dSzdYF}g0od%Yf~jAGFHX+
z(%!4q+kl*3dC_dYsUn-LzqRIeW)4MT(<qxw3atg>A8fA7%?#$9v-mUO_lTca2Hnv{
zeS!+u0Xd&cIk?fmklDX-1QjrDB)6)E4V%4i6aQi98GPnV9y=p)Z<|GMU3@f}U?Gfm
z^r#;2>h(wdN3LSD-EP_Qo-Wv#<m!%LRsDD|F1fb%h72g-PZ@cx%T}DN0liF3(enrA
z_sR>ol83K-4U{e4NMUzY%_wY4%4jF_jv&-RCad_D;i#G={j{#4i!@ddvV%|Fn}WUi
zsFgob9j~J9$ge+ejRlH-$)x-RXZ?!c<Pkb2I9QLSN=rsrD1+HC%<F9PW3<<esT(#P
zp+R@4UZTZIFS^6OSwgmbh6;|`lhrFlBve?^n2|5C{(<9CqSbw*e7<LO+7uHer%&*6
zVA&)I6fymdb(C2E8?ED4UV7QW8}99mMSWeL;T^SGl_|qsI#H*?F{jM~Tv(rBL)>l2
zKp!!o8_nQvZvSPf^%F@o7`QBrDJ`;57Ne<D3*+OLsQ3;beV{37BZKtFKH2vx9{U6f
z_Dj~s<HX?1X(^#FJl|gml|Qmh@*vadkS;G)@J3E>G-2v1x0WxyQy`snbze}$mXz<e
zSsdTI#d2D6m&@Pom~xTyLd}E)b(b8iBs17Fg;&Hj8%NMFWM$|{`Nj@G&B2<4?_cii
zX?K`PPgPcJb|Yt`@cSRxkBZ<};Zb)`o2_*q+;2XYfp(n4H$)Y`V6l?#ZdhNDX6tRl
zS4hv!h9$R@m=g}Hpw#}K5zm<T$T6r>C)NF%03dpyN$++dl$R{#(G8%W4fMfUCnuZj
zFV*?9$)N|R;ZwwkB;)~DTG}}0;olT6mZ#bgD{uDdNePK6!09!Mgd@ON-RMHP4gWHN
z1IFwC$++KGm<Lw-Vl?6&^6Y)hfzG^v;tyIuCP2*2aKi<BH#vx%ZJ)%O*O=3UOIU*W
zV6ZL`a<rCg9ytuEBn5fB&Oly6SopVJ7{m3Axj>bTBAs|E<D4B}9cXvpJ3Gt+0$$bA
zatF33FrI7(?6O;rR^qNr-ICj1V*?&$D=E#u6=l$Uh5>{?FE(tXpf`I7rXA*RSStt$
z18`^TY+x4D48z?Rbl5n;3YurM3k|?R>GK<M`%}hU=uZzukI2Yxl%jhJ56PzncdGQz
zSkKg8gmJx@LAk3Db1DTwW!UtMw>))z|0*#((*ZJpRnFX9iC=hGs{lJzen;qR?Cl{Q
z*fl7k(3Ai5k+`q2H^9hyNSL%ig9*W$*)HkX2FR>EcXFdDkQ|@}`M0WGd#mE#%r@RW
ziUEb;U1_KWj&Vaxvo@*X{hNB85vpk5FsYwzwuaRW8wzVgdF1H3^zMQI9y0zTIHf-#
zZb<2sNKWVnzN0ai*RyD5gZ48qT;3<JO?5Nm6j%rEe&i?iG!pHsHD#tvEZ1@rX0%6M
z9^TW=6DH1X30;$axwHi3$YV>s9L50$`Ps%2c9e^1AWsk*ced?$7oD%4<bp^1#tFQv
z!}zy|{{ST?k08;WE-O-X>56>?7PeUtzbCA!m?jm{6!8gB%guO&{8KSapt9eR-?_pL
zzC`?Ct(AAzmC0*roT*g3i@|-N89woWi|hJ3jciwbXwRNY8g~AO2pXvV)VQ(H6jdWO
zr&ge(hIEMdl>+8BlEA%Xw#yMdM|_X?C$DR|XK<Chk!#tl?-321?4uRqjtp$f13<|W
z-gp~&0L2?S9q+U@CPKy%E09+#IK{yog^9}M?C^_}naBVHplWM{IINv-5&%?cuAxpx
ztHLkoNMq}ia9A(AiR}yjL0v$D&F38KKP4H!&Jr_+e>cjS4@P}&*I`OL_8Zf;dx((A
z9yl*z3%-6M>WB4Us-rC|jvg~6aG<iG)tXYIry54N`A*>F)ALT+ks|uuO12%j%C-zl
zC4zxme4?$PHe^_Aa;PPOkCufv+5osWtIQogQqd^Ne}Y4G7Gv4=JtmN|NXo)T;)66J
zQZyV{jsXvsoPn#Q_oHRxUb|NXcB2vO#%qZsxRJF=yYNo6<SEM_YXD4b)yOl{)ZXgw
zVQ(70pPvgaAt@;)h{9ZW-#0kqhc$rF`Bj3tvjWNiB+%h~<c|^xHw-r|w$btb7WRh3
zBUyT0!&&4l4Z}jt#&z$3RQ*SzECR49H!R%u+1?2>gQ_+tNTo@mg&P%CT3pz5bCiR+
zZb~Dp70(-%ZD$ndb{Rm`r{(8fD9avF=aWA|fnzP8ObQldN8PE#M^71mDGV{F)orpu
ztr}Y7X6}do9sty{uwd-pcok=nCf~8{L)JZHiqFY3n3D<Bhjk-_>njUWZq#<u>ToR9
z2u`d2W?H#r4NhjOO8z(QKcK7krYGAoUg~1aW4ml1rkAh6I$nDsR)vMlXq|`k^(!KM
z2koG3yjgGPusIG%S!7{U(;{G7PHbaqtU^<A@y1+aZ!fP78>`r2!}$Ry$n&4T&No2Z
zgbiv_ip+tz!l}SEPUHcLBBDe5wb0OiWM{ZXbMZu=sH|_e0Z>WmxM4hEn|@ldQuFJ|
zFXP{6c=zUXdV@YCinxh#X%d6PS(Gf8(BW)5+9LwrmS7E5;u!7lO=-%b2Cv&X1`ZgB
zbkl>q7_q>UH){tRR{E&9;b&}iF@zs5ik|4-?WZAJlsI#ZBsj1&72!Q;WRnW%CqlB%
z9M6&^SmOC-c<KMo`ux=d*Mh7|y3;YOBwRnL0f@Ffm_P6F$`b>|UbFKTke}+mtI<#G
zfCNGASgc<MIFp^e;AHA8$&4tDq!F^#Z;Yl|y`*ToH19vI2XRP0^c?Yp%&WGJTPG;O
zg795M(Ls(?j7)fA8GqsrMjMFGeRKu}^2f8iP`9?*iOoLxJ@540^N_6(CM9lAu5(^L
ze`T7B0<QgNEhq)^X8+t2BIvgYltb?Xe)WD&NWH#gLL`;{dI;nt41GhMci@RCi_-~$
zEpW6Qg~OYn;eu6Ivgq^r{>5$0oaYW!xj=<g;+XS33bBnH09QOgYsl!`z*RzF^|XKd
zV$9EIRy^VTVpGQKcQNu;A28mI|9Y^-+BsYQh2(kWS3X<O>1yqG6KUz)gqQT|8B^`)
zoftS5!X5QmLbU0FIz-Q{HAT&U+xK-VO2H1#unm<Bs98|efi|ORy?>K&2lCZ>CfgQP
zNh>y0SUN=|*|e&*;x!A?hHfaS2lAbxmHa18?i;b;YsBYG8SwHE^R#C#-q989$!uTT
zdC3GPl8pm5Sw?(Cw|V7UA5$Xz{_v77O#D+D5*F^2rBwdq<@x_4WssEQ9s4g$#h9S)
zdj_C7!Xs&`N~Wc?8I1(daqR;e>vu2G$*UUVF_<w#_6*{777lLln}4E?sV$Y*5K$NA
zu11PbSIbFmEOOq7JL9W@6)!9pl>CV({cRamX-#_~kV`z)4l(M>M@rU~ae1oU9@zQ@
zXEIs_u^|I|WBQXiPnM#|9}!Ox|ABkk@E#?nw8l^-V*MU-TcXO#dh?dgXbO!b?!6GS
zZm9(56PL`7evewEghi+#eg!giX(b^K<0T<Zz`L`!Sz1oD-^^h4WJ)8Me+jaZ3{)lh
zoP+HnzDN9THtcWdyA@}iRsy>ryLe-2nF~lrUP^__+8Flj0MHplF8+34<r>!FnEAko
zpKno(-y*)EtE!*_CCKQq^FNqus=#J<rf@BR-yMsxVLxN~6jJo5lsbG=Ly&&)lNtXD
zLwRPrTV-e`|NfFM&)mg>b<$R7aT=2&=I3|dwC(MMKw$-G@rsHNc@1yiU#@tB#zbws
zv0K{|dcpj&k(}sv<1AY%58M?v_W+<Gy?{&L>(xc_X!dm9!vdqL(4L107Q*JxDKfTe
zDf!}7)kb>)GO22@;{wM`4m!BcpQf97jQAyeiO3BfdQTx2ZO3=g2bcN`rd&~BVqAy+
zFnv~OjWD8?q`qo5Wt94r!aCY!5bVw%+O!@;AOo1uFN`*Dr(owKW8D@qsc<?2KWgs$
zh%pwdXfESj9Ob_venxz<J$tlz&;#%1Q8VkE10P6COY(p>l*msIQLP=%Sl@h28DY<H
zFU-*jiAa;5e54}kk)X)?zXA_BfKjzxzkxZuW5idcrYIg(MzHiYqqp$;KiG>$Mwq*v
zs)mBjv~B_kF0!+Z)xnIaggk06s%Nib&Nj$qdA65`Z`3Vz{O*WdUe^<i#UEesWGAjR
z;B<~G({}nU+NUS|f3i24wPeuWA-fXqDfqqP2+#D3NJ)N1omZ}7&}d+PqGGaeInP!=
zxPU5l>-Cq&rz2~>QdN3l!Dminoh?Cu|F{8N57xfX+uwBWaHqf`qKSCo;$y@gOs+ms
zm6HrP+5FlMz|k6!3MN{-;}~j?#rtL>jf%)G5e2oYmdr`o#2I=l5i&HUxay5h?YO(y
zs_4Z>Dx4{;+V$3kTq95HFqJ$76ArDdUi1$9tF4v#Cpb8H{>s$VPI8EhoJ|v-e?()R
zxla)(VwGo(=fhfsQNvkO)~>ya_#Yg}FYq7zW({HHwrf(BigTl5G;@+SI9yq#^2npj
z3@Y*AuQuy6Ilu$BxS|LuB1h_=((<UQ9{?R-$%_^`MqGecki)-e<SA@kNI~M#VYaf4
zu*FMMFq&O)FipI?+5^<+Xkg<0DUIG}>4mJ*PUP7@;5LO)S<3Z|3i1guxv(NblL4^O
zD2Nfsg&>V53)f8{o*0I+q1)NOvNBj{o9Fe-F9+iW0zN?pPnLd&MM@5@Adcd0OA||j
zOAmX)Y%T;-?N>H;<_HYP*k2fpy-nUO2lK0S<qihGD#KkBug*@^e<nB{s5K1~)2?Q@
zN~n`0a$yla^Z#CWIrE}cdnPuQs9m=*fWkCzHIDj<p0F_Bb6M-S$8>P!NTo#GO(P7U
z2=he7RZw!h*_*>N)%{=tHa7M^>81N8iD&k_mf>sxAh*7b&Xpc9XbTjSDs*9CE?E{n
z+4MLsdJHB&Q~_dJ#SsP&N4!eFh%FxPfnJ4E7Dfs~@MrS120iJ;NP&L+IxroG786sk
zZ{|hvg;7JXdGmpbyplETte{<)PVpyK&9m?k9NO?g<DM-z8$+*x{2jseJkrhjX=^)1
z!7-dHA=|kBBQFLW4f`MAFNTO81i8e4EI0i3h%ZcUd*w*~j2I&RW+kO3^g{RbR}E~z
z9iRCL_5W09KWx7Kv$b+mc0vX$ch3QtEStL<PibLgveMOPE$PdD?QL93Vn4m*7#)h)
z2Mb*V{OR^1RDc%AZe~*E-cHvuK4zncT%#0Ge5=XhTJo410W9BnG1ody4fnbD#TOGb
zZcwlT+4+KXE*3g=pjUx}eYOSSkFI#^(Uc7pzfzE;&1(u>7(@z;r-)yTR~$`Yn?T3p
zXG-f_DoM(F=sne?^PiE(qakg3PfI>C&v`*lH+TS<IoaRv;U%zEv`fSFw>iWC*Y}xh
ztEDxfvPB!#AW#yTw}{^(egQ}9aiFhI3zbpzxsr3lOYO{(mBN>R!}CWf9xZsE3Wp~v
zr4P2u8#2@jNOPBZ43V~#iB0aeP~alu_e&1LY-D(6ii$jiFF=j0Nz!gMn@akPc2mS3
zX_P!M0{p_FyyHwtd#Tj@;yd8qpD-Mc)*HGr8R#2>{bT(IYJ2Q>Cyp@vi9ftp9rzAl
zguwnIV$VfNV#opGdAC8GiT5}&CtUQPBwPx;TFcv-16K^3@OV2=uQd4s1?A&r09Pwd
z4;Yh-Sx3G9r~uS+q#y8)7qinFHX}M{g94QmwZIr1&ECd=J@D6(C6qFHwqPdx&8t!G
zZ<~EAIz_#85Huz_txOo-d=!A+ZQhB9*d1H5gVVUJ3jlrkr!5bE=8jq$gV1D2pfGj8
zo;`8rdybma<F~AaSZ9y1oD6l*>=~f_6K8rfm2C1U&C!c35{OlZd|!k30z-S}ohqWf
z8!Lp*mZohq@_3Zk8{Tfru~Zh@l22{=fzqHr5XScQENwOpR}NHMy#R>@OZ65Nx^)U&
zFlGL;6)wdBwfu?_nxvU?=7{`Y*-MjBhc!rv`b(5-UPS6KW+^%*BagIG8=o3;t1Nl<
z4SL6W=0#i;H!kq5K?V^(uF_&lPC*I&)HW*TgrhF_3xl1$ZSPY<dwq-eFGe^;s$voO
zldQsIqmE~mx}`5lTBL%UBEJjen4kC;Rh@MNmP7j*@qMk~6ii+I^hA!>LCT1u`DyJ)
ziHKt+<=UBxQQ|-2NncG5om_l}<qXzF+gmF^_YrznI}*W2>6&gFICaw`Tc9Ll#7F|#
z<DH0KVH8rE-tWJ#gMCW*<JsPdQB$sOHkV-WO&8qJ3!01?7D}f$p`Z$bm6DuPR?Xk@
z6E}<{7gDKMKAVU{8C-CX3tFAK4+DTQNAaFY<gak6U)Bg#0<MlN+8b~z4CRl2@H#ug
z5z$2aN|bS=S{i&1+nb->$`UIj6usq4J4=W1{Hf;kvwuf|(!qR4c~Zb6?=k~Yjkq?L
zSprjAd;TSt`?SfSmyAPE9d_c+D}YmHd_m^{z%6SD`5maEv(Ea;1m#I5XaF3~=1glA
zf1(mFt;<)wga`hSRK_LS@32_AwE+hL`QLaKo$|E_d1u8lSGGF$(?62r_xyH&EjOIr
z7CqxO0-f3Ini|#s{ClS}<Woc$@!xCjb3rh)tkwZ{(_49Bse}GAqY!7#iBh(4)lf6q
zdCm!Zt-t)%BO{Ope(uTvIWy@gsbST|kGJ-6T3w{K&^4@uXf;&LJ05RO1{{`I_L~(@
zU&uK%#~a$tXa3HVPVKpWB=)b!cY1EPHr2`rb<^F4>$KHC7Xd)wiIU@>Tzw+|JQzWt
zl3ws81rgaf3@M-xpK&;0{>#^8#1ZN-?+-tkt9u7S40e#toc{#I<pDs4QObh~OjaQ-
z9zK$B>ef{O67Q&#;mF*{cD;ZL`EoS`9j2k2Xb5=xs1x-By&x&Fa*xJnlRj3TI-PsB
zw@z9KLnX-Ft!Yfrl?706A?=@d5Yb^f?muS%M)MV6$+wr1uQ=(1#n9ppY3vUQ2p4+V
zr68{}1T@<L+5`D@^m~B2`l2={EN`GxMtY<6RFlda);O@BPwF3JK%;TZH<;88H~hX9
zu~&A5i)nBNTX{j`<3|BN)bDQ*pTSh3KnJj6UdQ($q&9YjKi#DcsJ3cVvj1}$_vL^_
zZoUSne*FkQ)RqD7OabvbP5Qq^{1WlUx(24=!o|8@OWUUsJz>Fcm&ASok2R&EJZc-s
zCda;A!{{MATDEP*qLn!JIEVzq%KJV7-&K-$f-~xQ=5if9gUH&#`6}NSxn^Pf+eQ&P
z>?a%c`LlISmKNSG84EN+n;lr;D@!QB<Y;xSi<M;Vp8?!aSRT5wyPJ5Ho!f5`0LpZ)
z220yo&bB0NdH5AMt%PwlHM-80dcAW8(pqly77-YV61F~-Xm6Ze`Jjt+vW}LT`${Bx
zIGW$~mg|DmZ!Ml|IKa`~qp_$TkK}K&{akJG%ncPpX_MF+b2JmN`vP`T$3?QgU3y&9
z(mBG%pE2MhdQWN9uI5H2d{6-m*>FfVICL%VU=g#ya@;Pu_4k()_=~-6Gw7yxh0+ez
z?=5Ramxgd)^ZsCr+9TA&g}4`M{?HFlK^`%XvWQx02_2&a%=<BVi1^GqERGmOjg;bz
zJ!)*T@0(X!_w9_Sh^_RhZSm1OedU2S%KwsZy+quYEx!m%_M7Y1fxg7Ecs!B_V3*YZ
zXZxYq$e)|Zz(-K!wunAVWw~Q}cSc7o-~WajCa+_MolRMxR7i75B<DotgktlRaG|fs
zk%?CtKC=JODC~-4e};3}6Gp_&-T}aDbJh-&NR_5GT($zkRT+Si|4^WM1h=WM=sqS{
z|Js*ff(r?ATB1E3aD&&)Gc-|ZR^?yp;~eoLqCl9njUbnM65%nWT)S3p`uClXVmKsV
z6EV3n2au~6E*OBs@~<{`O~<_<wYz55SAq6qr@tgAu53WaxSoR%Jb6HmxhP{fcNPBp
znJRgc+3>{TiMo{X;efcfH^F6c^PMT6mFyI0OLj||ztKc6v+M<&!;QE89IT!r3NSyq
zQhT*H%9a^;ih=4<_?o>^Va9t1EH_j+dP2&@M8+Ed$bEo%$vX6`sydgHu{LD1?q7EN
zp*H-R_2;XdELnwRhn#u)#CAwFHf)OUb$#WlR|CsE!on{E!(E2^0`opb{D%ER%DGN<
zFV#oUCy=l-)cPg{G%b7a8GZal8K$u@n_|LuHjQ?oCHd2OoYn}p^=|vDBKE|q4$4gp
ztd77-#B<ugAWkDVHb)bf<@Wz0;txRhgC!eFvT`CP9@k~a!g%7^-Xl};oDKU@P>oK$
zUzw4r%620ydu2z9P3Ur8!+{^umWqaM40ZOIibTJBBV4qAfB{Q7r-i?1^8hKsxw275
z;+=?y;iEd<#6V49oHFe_sO)$qMO-Nqt`UDm1iRdz;@EQ-PE<mQj76yIwdH#jsiS4!
zXuQS%AlWXZ7UudpOQZ*!Q3)E&`KL0i<5o-`TUPPi-X^*SM<qX@dfUu33mdqjL(^{(
zu(L74k}qG#>vtsRHF-du1WEkRa7u1g{yOu5wKn2Y#IJM<7rtgE>k#}$#8<Z9iZ)M{
zTWl`=hzd+%ha>J%Rhb(~nLLJ03^+NOOhL5IOaN@HP@Ws_!0qj@sLtL^IdQ<guiwU^
zp9p+_ly@=;DI>1Dx+O0HFYxn=ao`=AwZ&t9;!thef5C1&n6J@ppE-6D-Rc`f>>8_7
zL4CA>W(6c@*8oI0cCdJ_6!4y`g5MfWN;;uc^x4^<zKgw2qCnGdvs>6Op@OY?XL!|#
z9qBmEZ|OT*|N2^GP1_AvyW!-8x#PzEBr)F6Xe-6=i#7g){e}DI9ss-tSYH}AiIRxd
zOi0U9um+)3Z~sm4HXt(gn9+s=;d{g$>0iTd)#jc8aDH1KfTct0e@FZp@s;=S&1oC*
zt995W4y5Ji?hMEG5kJi^7c+Q;9mx^I^3mXjAohNz4i(6H$4v@Ll|GeeYK%#mht|~O
z8>YA1(mQzVWROic#f%#NutreoLK|r++nITmNMe7Fcw5(iN|>W7h5rSUMZ_l-tl?}Y
z9QeeWEIU&PUP0<tg3if~e$t0hP+2+Hb5>UVe4x2QC!tuc-;Rg>LBu$5E|kdMY?Q%9
z5<EiTw8WF;Fh5~`^%^_s@HKE7!V0epIBL0Vm4dec)PN^bnk5ifW0Bt@{<S#`v$Y&W
zJm+LJ^&5ti```kf`U%3J;gA<_87fa*HZw>a;VZq2gJpxg5xkS3&-)|Dtv~QX1E-*Z
zKf0L^F+abBWp@kV{F7i%U>zkpvJrg?z{xL0Ik;9>EiPBPyJY_)HVK3O@&Z?rcx?0$
zsYhyw+fT&sf*UC;8_CQ6e|>ax)Ng%YZ997{@N(Zd2RN8KL3(jLz#e&<s#F^YRxIIS
z>uQ270koMqCi%?xu$PEK#8+-XbpNvA-xwv)ug}nefd0n6G@N+v!jK3!SD|#g4ds*d
zV|GwQ5@-y@?dAbQQvRK3aHXqKM@nfF*4+Vyd4`VOgPB#>qk-!=l1;Y$J#x9vcC{r?
zJ(vMY$GxF~Xsz35#*Hi>s%NZ2X*Jp8;V)?HJ1*iy%o30gsM&w6dwC*l2EK9z4whLI
zA5mD#{eO=5l_P#*&j+0GAEs{GE+UrJa?qf+FGvn&{$_1@wfpsVbdlBe>Ify2GOCC5
zEb<^kz0$UMg-EJQQW~bGJgw)U31yW)mzn##0T1>(%Y%PkIik+YC)TM0^oboT|IItS
zubf{&!V><{Wo|u;>V5IttEh%{rj1_$<{l9KmVI9o%Hrg{+gM>Ww=DJLpKA@R{6~wE
zdSd50(n1vwCw3~)BC0YVIx|fEQUXkM&7_7jTjC++R*dd}qakV22NaODVR2@osqrO5
zDanF!3dRkvCd^z7^*muuQu!v0Jj%i}FhorD+2k}mWXrY;?o-4+Y#`~)4Ette;+WB>
zGfMealmj|Fd!s65{Gs&W$^G}d{9E?<W}OKYZ8E#;Wt9FCVyh1P%glC51YH7@5(=Uw
zeNiKgzN1o*ck*7d1EBAVx12#8OD=h{_fve{e1ytIAl+EV1}Hj1gp^tIS}(+faG|=x
zU}E5o)3eb_^WENARWfzp6Q{2-mH0LVSCg!~;VTOq)0N!m8Tg!BT)cbBb@n!tNc8ry
z^3Ig}b@~qxzY=|y9XeU{r!;ME-UXC>#M|HCEG42S*rQL``wYVT&Xr90zav^2n0tyj
z`l0;r>0gxEdnvUvFj#*yPk%P?PJx&HV&+)bjPUa&i!$Rpe%hg&X$z6@y{&cl4LofC
zM#AJg1sUj*sgvf-`ya_h8)TD<Q&e#|Y~0BG=J(|5gC%>;zdQQAWF$yo=uZl_mxvz`
z&&>6|am`DHVQPPX-pWS}u4}M1l@cGEAKH@qcOrkk@k2-2Vbl*eXA3Z+fA~QPT#)d;
zTOF>p{C(p9R1wFBFUI?ctL{?-U9BHsx%qF5f~d%8O2D0ZObW~>v4IAjp#iLP6r@gu
z#_P2J)IZ8sDtP~QoXKF6XkN${NuwK?!0R<nSMi5e${2$iP|GBJsZU4JoWYu$Zy?AG
zBBMY-ARIKKsv98EcBX+$d}zh0hVdjlKQY@Vfyo*b@+|lj3ZVmjT?~Gf7Kt`K0`W(S
z6^$X|7MiKw3}6CrQZv+eKh3w0q&QC^q>p_ynY`;d+~zhOp|H19ERg+=h?j`}w6`ZJ
zj2_nTe*}KNam|zQCqX<LOfhhGz)If(<%_cRf|ZkYW1Bq}i4f)|7V*^|tWL|U+*fmk
z_wUcFYER+&0f$wvmj_c>++C2M@r_|>Ur53pNoy+>c-YL~gQKF~QnENedz~zgefD{Z
z%0Tv3sH_co&87qm=C0m+{WAbPk-JT1$c5p?`^_gRASE9M(BxzrBR})LJ_8jqDre}n
zGveqYb-XvDwE8TGSQyhkBTf-t$TK^3^4Ykt4?n(W9aKYa9;@U^+tK4-l3`Wd&ZN^j
zjPAzBl|H+;tWPZUQdoH8=Sm#I{1JTUVQEW{<_~+Pr{8~4hnwvHm9S_No^+YL@n9^#
zus%S+1Iv3Ac+wXf>mzAFx497`KA|=WtTG%s8;&nB3=rfslMRGH2hsolfB;EEK~(Tk
z&M#$mi-*ot5QWJl8!Yt2W&)I(aeKY~lac172_jS+oZ;A#ACmg^W<s6;{;&g?F86$1
zkE2nPsmP!&7U<Q8v?F@vn<sSNTCFFjgYRwUq~y*y^wCPuMZ_l)P8><=&V=;j{LWOs
zuSR<fCUu!v$Q%A}gGN`ec`3atT?cOw|BU!*?@IPonuLPxjF@eR7!|iI9ZR4@&pV)I
zoc4k>e?mtE%-cR)sSOc?xc15Z-<Fa_MTGyhIn6XI*_o_w*|q7T&5_9~_&e)SDkwb#
zT-s9K{-5aj?&)*h2OxoP#bdNkdb1(hTt-*Y0IJD1BXKB<9-D~I8wcnV@tJS#u-@4c
zc4<Q&)(9p5G4IB;x(Hag=$_rW8*o7<-d6^okJhm_nHkTvX0u0N37*abttu6^jfA9a
z35p95UTq`xY5pEo{!9#T0t5#1$b=hM<Y~*wbs*53L+sKQwvQIi&J6aS*}T}y-#aqE
zlL^w*gFIpyCwpaA^8Jb;uA0ebdxP}A8Jah7ylCJnd-28?w<Fm=2{*bSqYP^$7|nZ*
zR?`bmSN~BGwqw|0!(Aw4+=C~&O(Bm0cx#-Gw*Zy4X5|{g6gDzs06-jg{m8{iiG`VC
z;(!M8TccH9REfEv?BDQyw}>CesB3eP3ZU|5?;pL^yMQ)*0In<;;sgMgKxe;R9xbYD
z2!VP|>yafKiCzZ|SbRi2jxY@e{KS(Tk_s@?^OmU%`|0WM+Hsv86H+!682=?h@Dfd{
z61y`CyYX{3D>WrG*onOUh}O5TciT<`fQwDC{OgkhfC2;T4A80Yc!fgPtxI823miDT
z2MR(X8D+(iTrw5Nj6r-d8QYcNtARa@Sbz1A1A2-0lYxj~H(By@J0#+{51nmnNy}mE
zK^jN1M?d}d4bb?914v_LE#9!ohGV$lzyk;P2{-U&QB`t|*+4-4DcQM;y^`U<I=SDu
z`Hfj94H###o+*hko*+rjo3E3~KFaalLG2pNR2es1WY<60?9!7R0JVTM^axcpDB-W~
z&ls;-h$@_Mh0_vGyuJQ@S7OdC+}#)c&R~)J{+Z0DN`Lgqf}eQ0*+OOR0|&}oQ>vqw
z@eqR@p$!LA5<QoVePE23RCJ{sgbfkGM13WQN45FD37LtsfQkJGmI-d%P(dAR&xf?D
zIq^IVheY0QQRMuGp|aZOtb7T2lGPm%-yKFrd1$c7Yl-tXo6c}HYYBew%xgogYvY-*
z`74cBd!%h8%*dGvcu$@2{^c{n0F^k%&(v^XoxRu~SK^Lj3Zt!!6aY0}JFqYJUeG)d
za)!hYdSOxOw4TNnM!FA%<^)0qyyADmn)lOl!??6oi(dGVP%ii{IZeI?6RXr)9VRHe
z3b=?!Nf1!a=%&unz8cb3iWs#4=DlpoDQ(%=j$qP|mK;DPuQuZ&`ACRTM7&1)X*q#-
z2|=_uJ3#Kel(_BK`UmU6s@8o3ktQoW(PwXxRymMGZM!i}sUhDBt5p=7iV_0%#!<`f
zKWuK75_SzzQe$Li7q|!D8JHsq;>ufMhlnqBwJM9=zW66RlWg0&23r24TnniNM%i8>
zevSA~TgZ`fwj&<zHyv=8OuM6z>SU08K;e1aq)@_?Cso=%R2nMjHkFJe22)}Vft}Cf
zPVeNzbEaY{c+JSkeErh5djc*heXXn@o_s1sA)G)TGsDkIyW!bTa`IVN<nK0xOy%1P
zgK&N^{%gt*_)ivmMgk-xQ}77QrUFHunxAh7_nOS0E`^Gr*7sh<*}$dRN0z?8or!Qx
zq<F<92CD&DDzpq%^qiQr)6o;OMFCBy<SNR>rKMAg1ms~&!0OY1nhzX_v;F%6*T^Y|
zl1Xfj5v|oYcJ>}sktfo)<a<z<a^y3SBnYAvd2LIwml$w`_FECSG#YUR4Zgwg<W>9~
z$!|e)t}J=3Y{aJ!+(rQFj0HH_^WRMWGk;_N!gV$8vuq#$-=G9*Jb7|`FEbv&TN)@J
z0c>F;_qO*5s~S%cU#z3&iNGqL5*Y?qI&n{U?;_&a!1|8RHR1zv4A7(bQ40Jc;#)*#
z$<1sedHzum_i86uDxjYGZ#EkVr9oa$|0mp72a9pR0o0HhiHve0Q13`6&*TJ3p9_mq
z+Z3o&7d1Y3GW~~?)NffcZh^r)M?7KqQky9ZpZN3FH49Q9WKqhYC^=$0Dq?O3iL%nN
z18=Rr@|$rp1Fh-2Dhz!DHuuVaFD21A|0b!I)Q)Nvm<@f-nFLTvxaax@yy~7&%I~Jm
z>uoa5Mjg4a!56QPI)MOpTEEUvkFc1)pAl2US0jm3+lh!L?j>2ayA5CQyA#Jix)3Fg
zwP}u$1z7ZDpFz4;YvCC%i3*-Aq45{G>N8$a7)ctynn$nsuel1PuttoQ!s{Cxpu&Z`
z@*xWV`-soH@v_GOPW;yb;9uEW5t0}r2Rd3oP^{!5yEkD>8x`31h<`@>j`U6NyGD}O
z0c&FOrCPnoYxaPJD+ACABUU$rBoz-CxSLlVL{%X{nP1nhgbb^}6gZvdh=M;+mF)mB
z$*sS{FoHbbYBch|sVK|`B+q}1_*X;`@fAC%td6S^_cnHu8&IED%k!U6rFOJ;DlNGi
zUy}|&m8qPpD$ub^<K~UX;De6ypG+_*zz{7JvS#f-m7wpfWu`alkQS?7*I)#lj{JMW
z11q^yJz^sRnRx^?4A#6u;TvZr0Jq!U?`#Iz#Vpcm*NXdj8K<O;`4eHVwv^umLO1IO
zyMOs)>hZ#qE=TAb7o%>Vx55Htm`p}W&dS=kA}eDRmVt?|+nbsrFh`AAIBsHqcV-bC
ziC86wQ;n?85q~pHh<VZ>4Zo!ko2`YV$@CKU2dJ=~qjioPnEY{L$e6mho6RDc%3nuX
z7%Du$JJ|EEZsU^;FCeDG^j~?I+C%ytnGFIpj4oRa>T4>R>If!n#E&e63~uRQ?sBB$
zS6U50I^=GB00x-JiJbceowbCA1}>*vbASb+(ZPWK#Ew4jO<}JpbKr$_sFgOlyhx=L
z(?>GGuvR5!k0lT6IhZX_dd!xfL}@*8R{ubf*V-)NqxqSf#pxsd1d8=Ix@Tf&q%EP_
z8Q9f&wnSlGaE$77bDHR{o&Or~3zpMEXO>ot>+qqM^*d=)cyGyfo~(gZW!d7RH^NV7
z#;%Yw75|LDkR1koTmz`V5SvJn#uWTBX<0{jOgL<%wo+X0`Q4kfZ7rVT+nPo%Y$Se#
znxL`1!qQ%67VB|FDiK!ry2^a@;!&<P)j+MCe_ebBD>Jfw2X0CoiG?lJxZVN@xm0E7
z==#wS?x=R<6j)`{MrY$wDx=*b4azC_8B88<ev11eT>29ua&rDMZ^s&qB2v?<|AiVT
zmmEL|*GF9683NziX=>|~*#N;({E5RZto2jwb^6GM&t}Xsc<CJ)xV6KO1Tgo$qe)MX
zaQIS(h$l@DoKnaMu%+!f;<0DW?RDM7rEU34qvwu<^__RzPsFxZ%ZUn{_K_@nfsYlm
zi6juc#lXA**^#f6@cBLe>6Ow?Z6_?&kDyFVny=Z0xlJJBanlDYZmRY<V89RN250<1
zu43vrE?3mh5%DGB5k^H~#-E#YJkhY8^$x_K%1YFUg29cqrVUp$X&sHQK`X(PX21W}
zh>p4UYNnPOlQ34HKVQ6Mw<R5xmsxDVlTgugx&5A*!`rf>rCFCK+-dCqhZI*A1e(bT
z7;og{8x%%Zb-1;GTVVqPCd)>WUb5NjcV@Lqu6!}t>|z{i?m4t!S9VL;oCL@n;dvv2
zRUP(#bi3!y6*T2PcA5VicxH(t!`=ox;av)%a+me-2!QYK`^#B8Q?Wakk4~<fWB})g
zA3XO1zcI3bS0q5O>A;qgy=bq00yzYzOZY_T35N_p_ts)r;`)`2@5u?a;eVJ<tgV7L
zk{4@GL71K@**&$Y9g!k;1a6f)2mD010g80&2zQmu;#to9(!D1fFl{2By(#z)oU-?b
z;ALSg%al8y5(&|x?Xt8ilg%`(?ahBl0F=T(WO&8(zXO6FfTbG<NW1vX(W?o=8fY+V
zis-}mkA`PC@SG|j9ZYE5W)9%O)Y2zj{3;o62PQvo3)@B1%96!_8psk=KC`I<9wNaY
z?!Z4&K3DCxOv$H)7gjcY|5s#`&#BJk87N(t3^w;(0W0u~d3!1FN_W|w?dU(+KN0nn
zg<)kTkLD+H3Q|X4vKeZ-l*R1$!_%e%9W2Ld@wfwHB%f#}ec#joi?*r_xAuHtAUvw8
zIWfW6G>=MLcdWcLDKn6$TBn~ACtSX858KwGdUz(eJZD~PHVk^P6WMWQRle`6JdrHE
zR9Kzq7<$6!uQtlL%F_RP4M3HG0}g0rGbfWWzp);pM<AsJKgn|+Ni{0|j+8zd4q)I{
zCx|4WQ8hT+><x?CpXDdjQl~4wV12{CabPMuoe-Xf;jofmc3V?8D$S`<dE?^m#FIs4
zG*heEn1v!UJW~)X;p1m0oHMT(+PHreVWq%cK)A}!Kgor28DznRw{{nu#p=@DB-d=F
z@BvZ^faeJC5n*$~b#BNQXWJF+xr)sy!hf?F(Mb_@MgY2)?JJKe1^*KW@IwN%G+!N@
zHT}tGqgke`2B4F)BynFreXkr$5k@mv{G*rRN?to(Gx{$8izoP_0pq!4b3EJxAHkkV
zmTJ@V_tsxCtoL6Y_h#dGDiDuq;(PnA_s?h)(0A_tt4(2QNy|3qgTQj7o9v;DYBb1_
zv&=RwIN5)3(}mS*#n#P4_-z`$i})Gw2T^W^isv;grMKo0a{HC?FRaQlVwWe*@{frB
z1`OWUUoz;{co6Bi54KvGu#ix=Wf}u3{O2=&IKfHe>0j+Oa72cs03|nC;Q&>U&k5Md
zUZ=c#t^kh$i2X(+m^WplSCi+A;FTR0m{_-I(|f8zO~6+=+5%7TZhflx>3Oi>t!m*I
zts!3S>50uuHbDLA0~Yf=;(tZ_N5ub4gjfR5Z|hk)GYII7-x@Kg1M#e*yJ?rrk_nbq
z3ISIh%tq<&t&X06u*&*9Zr00p;}*U|yzz6F%^F6ByhL({$=-f9m=t?k!m?x<UkQ#g
zk#Nbz-YgX<*A$Xij2p(I*#)9>8(7)nCIX!z-Xi`x;veJzr6p?}9=_t?^9*KZEv7TG
zSZZrz3*y$a*0DE6QuJ2u?d)_lK(g158mStjUOiZ0S(i^XRjeVuYO?!ZXul^rvto-h
zKW4;@9y0rP{;ng5s{mbtl}LYpfb;t=4QLGZCnp?WXEn(8Y#^{i_%ArJig>WM_n}Uf
z(<CzDC2svUGs8~Y_DjTwYgp3y%esIT+~=?K%!)4<2|c2dzoGa~0CYo$IB@2E@@|C^
z2BM;?{M$zvTy5mtU@%<O@SdrL=Z!vqnw{P8?SboxOg=Lo@3>yTHyEO2aWqq+VsEb0
zjw&K{vVdaT212%Uyh@$}USLB;c8>TX;-}3a+_N4UO(P}QMiNpOU3=y&J`McE*(Mu4
znJ_OgK+(__Bac*^c{C0&M-T<)?qDl)CF0fW_>MZ*G3x{j((4@o<j7gyv93q9Lo&u>
z|6b6_sbg1e<Omm0S{V6I#ZvGuBMYXEoC9a8$uNX;5*Hr0rvdrE^fVQ|-ZBTW{O~V~
zFSruDkL>>$yXZ`bxbYeJW`on7t&}~GpAJX_au`wC-!2f9O-Rv8Jee9K@1KDy^rpO-
z$-2T+rSVY)v)s%M3T&ni;s7rgK)2f5bM8Q%(vq(L?ZBNk{LP$6ZrQD6^VCyvW3_h}
z)kXkT2J?uQlZz699iShUmd~UWr6kbSrl%h0ozOK#OP>-2%7I~g1?#^<8aS9vw=`k=
z#{C~btRp`Bk_tP|=(^@LiynsSuav&zj;{8`;At}kDN_>#R^mT<#H2loa$mW72H<V_
zy}cVwy$CnnahyAVnOCX&ff#d_He@eK#5Y=fHyXJr;L`{Vy&S`rh(GC^Se(||CjX!t
zxp3AeTg;wE{mj8QXRX*B!#_Hknps%#^h!+jK^F-EIY)da|346~|1;tXai>c=f%DID
z`hEe8DaK#9#}fjpa&3U3eQG)pTRXsSR@qYMP4@2wQT|}!FR4$}PSoSobJtO@^IwS1
zSDRfq@fQjTm0gB_mWTgj-By8!Agq#5f%20YK!MsvmK&V7n$3G>()WAjI{b6urSzFv
zs$x3KNCulb@WKwlH>8ixxZWEUF;eTxGvLbFj)F)alpqcCh!$+O20-H6I$)|s!(1RG
zCwpsD@+nDHCP;=w2K<HSJKGeZxAi+sMkw6hB6hs@;@t+HRygA-dn3Cq!2@r5dt)%&
zXqqK;>$aJUwJ^DFm>^=nbry)K8hdF{Vt>1cuMwZ9xV6@FT*LX6SlUMP6-oKVJ}-|@
z;88~~z!R1;y8r2!M%-~)o^XfDN2KiHW^)ssNPlL7ZhH#q(oQt|;i?7@X*1xB(^Ha;
zcd#pa?k9NwrP`0_o1eluzn0`>2?iS+rn<%plC&w}CE}k{0C)eU3#Q<u18o`~#3dED
zkrJ`RA@hM9ub-{Cjzn+LhKUi&8?k2W^Ap8@<*Y1p?-21l;)U?A{So|}P`pZac!f?o
z8Gkjezb_#~4^bYZ@K^I^ubc0+76`wwa;jK7;#)^{s$So}pyh0%NW~#fSf3szS-*-G
ziu_`d{l+q@md7bJ1K`N(XBNwTFfMM!T|59{hfO8raz6)CkS|w|c&){CbeS-)V70Cc
zQvN>?`-uOSb>tO*gl1DkA_S`gBl$o>+)#yMTZPrz+s1E<EkDp#sYayOoaZM3{Yv_x
zNFuvXSMZQ||9}Ay#G4@Ve}Pm|??#_Re?LNjZAsU)5UCy8c_NtnvSv|768AbSoxEC0
zd<~2o)(CvLer<2_KI0C2n@ju4`S+!T8bLqLjAXm;bO-Q2llh+ka8i-V9&C<mW7mA+
z^{&aqMT|?&Tx0e}o>}tLRSm!`Vc>R0{I(L*(vI}~HwsHhq`#>+WfgmM!S&0}CA)W_
z;9nb9T8ihNL4(yt!{`-2T06-twmDk|_wp2v-p^dvfI$^u3r<31$_)wSa~*yKz0vaJ
ziK;-&d%^lm0i8Bb)#Za7hS|LOL;!wYbKr77SJI>jwnc@32~Rp_cOkGVetKb^9!U01
zxUm|Dvr)twtUxsZCI&kXVJ%eCX7p*;f)1EBSSlCRZ<jOzO$<?eXTkUU(DKv2SrPv!
zdn39Zpq}hhDRfWiL*9F4WKZtEX9^I%f|8DAny#a$*BML&zLD;7`3d=b32;C3o+HzG
zjy7^&GGVW{@%MEAoJ<8Lwa{`dUu{C2T6uQa5tR5&6#I_vJ60-*M;j_E2@V(i*cbfF
z3*_e;bXG0{4XgDn&#AaoS~L5cs`QgpoOxz(rmwUgwY<-C&otItMB%4M#n6)K!O<qj
z%$oqpRXhW^qfqWhGr919hO;Cr=-zt3caQ}gywpJ4=rRD3v~#GA+F|{R$5g`AT27E>
zkqxeE<S?tFWTqz9!=bn9@VBIpCj!WgNcm(6ry)~-X9wVh(-w8BuvkD4%?7dZpBS*%
z+9Y9$VME&6m;6Lc7gJ-3gcc_G9mlR?Ty9=J&|2zzwir!qHOcFGo__^=?2JE?bhoqU
zr*@#^`Ym4mkw~JF`Q9WLCCH*&-@Fh|!%0#HglcLXfg$fjn)uJJY;wykR+h(g1k=l!
zf(xtIsBrJra{FiqM;L~=;j4Efo^oT2p@O`WEdX+&Vlv=;gKyTLqMSbpqHke+eyyRT
zl3@A@(9RwB5|)0C_#@&!@qtUe_F!c45><BV{a5-g$;QdnthL;?UUzYZDL@Knj|2PC
zSu?7{W*hb|8EFRvGSCrru*DnAKyx9}as)fxdR(!PSA1w`^TP51&T?$`kbG*y?6KC;
z{@M6A3nwXw#F$GpSz^{MyMJ+mS9|+O&*7INx}pQw$Y4cP3M=ZTm9;IV^_8BOzxMNf
zhAt6*;x~fw+h~fK8z&(Nf-_!n&(2l`D8&}-P>0NG6-yArDtt(7o=n{S52}3Q`T#6a
z>JIf;XYF7+sLQi?!fgZ;md!{`RKTkYw$3>M5;U3&3SZc3v5)2Y^ym!j>^%-0?y|I;
zN=2Lv5MJ34>A1Z;DuB{>jX6Uw%jsDN*aueo4b4_%e0pJ$yvUwkSyRu0sG2Q0?~R>K
z>X<X9<ek5I;z25|U$7`=N@?K;`WW#GLo_^JkOoklHg8a^`KP2@FjLQvgyd@Dy?SfO
zPB4t57PArkgUu6ct(R-JUP3jPG<N5zY78W&Z&Yix5oikxdqJAAGssx*q_M8Sd57j~
zy}olAfH%sQLa-H0Aev+FGvZIyOv`py6A!xo(WJ*CwV_9g4eB*XSge`5Xt0(y0OUi&
z_lPsY-u*Zw{-dX7Y&N#@fvn*>-3YN+Kq_nr)d~^Z@GI{Y&>7EXN%g#t>cQr?RB0H!
zaQHW5)Qe=|8ziwho^^rBo+U}2?nPh)!GSRFA5<`<%Ie{^av7m&(=BZug|#tW@CyII
z@yJ2&fCDaZddp+H*!#<SRGIVU9!Pv~0uU8xE`K%w>)=pO3ZXq`pt7i2n=y(48`ifV
z3CLt42WEK3)`W>0jW{|(%UM;c)`|SB;a@uba-CgV#6riN&nD^;`7~HAdbf`O(tc+P
zQ(`f%I9W+@H$Dn0@1&Z)TGzpZhre;>i3(}3$v-!Ahw4S@Ag~u7{3Vm&YVnQ+IhB)?
z{GAe8&b^0B@IubK_|P}FtmTFqj#{zKVEc$45&w$#1+C)+QgNboDMgLiobpe}jfgHU
zO}Jgq98#}ueyJ}0Ur3BSO7;MoiQK!i8&3|uI(5W2tK>OY44c;(bVbVP$>wXsKO??H
z)D+kE{K|%kCOD+af<4(N)CRvW@c(WIq4X5NvA<$u6)Kg67QEdWhBfdkBdZm>1M(Gt
zTw#s9kzKsPv)?fC);eQaD`{1k5UL<G+JoWVaF_=+eBuian~j7$y5o_Nu{Y~R3gX^m
zQ01@9ul*TXa)af3J=ti94rFq)u~!KP*OCGkNaWN)Rg!-kDRkt`Y%@qlMfxCTl$h)3
z-y_gP$@K*pTm=g;L+N!8ENYd!e^g{|w)m?sdU7xVaf6AyB8dOaqU>?tN8I!6BLiwN
z-I<h0%kjp!e=`G2D2N(d*|RqVmJp<!Cvt)f8R(e=qbC4O5CLbnjAAo?4&zrBqx})_
z&xk4FKR8l~m_k|k_XB`*vm8qmeEmeOf%Wy12Cm{(-w5m?aF?EjWA^<CbwUkHT%7JV
zaI`MiZPM_wWhW)a^=Kk)2_jBx<c1HgarbJf+GEHHW5uTTS|0KXCD$6YNlQXnnQ19q
z%j@Wq&0BFD!Ienb@cj({G@(CJ61>nJpAq=&KyMp0AQeG>q!PHEMJf5+<&N?qs;r>E
zkd9Vv`C^T5hs_#>q|-sTZb()xfUd@}$qoAtb6DyLlHyZBrW?<d{6vjP_H#sqBzuuX
z1RGwn&AT-Nv26HntpB2W*19%kUAghr+Px0%HQBtl-kwmz`ZX(}D0GNiE{Wq}1#4pR
zJ1^BqD2OHJ{uPGjY^joH&}Zu6+eJvFAG4Kd8;BT*LCdv{TX?tRe6^<h3481d)rwdj
zps-5o3j>&zEbNkgVVgq_Jn;+~5ulg=!8Hdfr-28c>7GET`!7rpp@Q5l3;m6~$m%B?
z>dBDGfb{Q{y^vHtA;s{4i%ZNn!6NG1CM=X3QfWz0gBuZ!y;8x_N+;_#raLdTt+&>y
zr4`^)CIJiJ?}dXlV*HEV{O=K;ZKdC`8iXjcM|-D<iUtQ>1E?mFF?*@EenoCw-2e^$
zNPJeU)4wpct{xlZ|5CnmHsgK6{yOxvkCvlWmOHAUrQEdq*+!6`fg8`9?#dGKehq-c
zJp?gm&-$z9Y-coQwGQATX;KT3c_Ic(aCyfROynnbM5(f)WZv)~N26voo?ljQFu_~F
zMI@f{7NI-|9qok5+f_D5sN!GB^(>&JaIDn0#8mdi?39!@lEtsJ0X_ar8SagXhK0W~
zKD_fA2v)D>qGx+Y{AdGC)EtmUz$Gp(?Vcl#^vO(Twvk2~71T?_?-BpSmsPD_l9*3c
zSST_FQgFn6M*I=+H^|IaOF05$GFnr}#u>U&e%l$TqX@2xZ7cb7!!7%(BO(6gVA|B=
z^hjSPC9kq=sT<Vq)n(5wFEKJMZHRd0r<a201=s&CgfPiRKd~HuU#tco6-0|Fw($V!
z_g|v`7>)WYY<kv2)50wi0xde|1;kZcc}M7$I^?`gpMuc&3}g!mO|3~6r5%vKCI9{>
znDCJdc)6WL;I&7X`4U?nDZd7T;*y=IhEsi1L={$%U}s<^uXcV!grBpJN|Qv6OSu##
z>Kk#OiT%&!eZ%DUztQ{{)?zTPC3M=1lKbDYf6AJLA!D<ZX10jsrHg_{DBAk6IZZ`I
zDu>l$3nk~T!JOLl{RbvIuvHVZXN5(USn=z+jlH*`;e~-BN1jNrJxD7y)=y_+`EQTd
zd}qoGfv!Zp^4HS0$Y)3L+`yg-KwyBNbND41_nrUz#t_LJslE90Piq!HG}aq2Nb@2p
zE6W@<d*HMIXI`ZB)o3S4rye<#L&hN%c+CF=Lr_~IN6qTgW~4a+4O^s}9h#DKv?PFB
zKC0l!vJ0HX)$|vVgf{dVRN2EXEN2N6)lyB?BzY#1=Th*=k_Z(6mGJMlg%iTKa`XOY
z4$8qO;Q^;Lq$0x{!AUVH(s>{ue1ip4HkZ?|OXtm>Mioq$w`%HF<Igtums7+qlpY4?
z$d-guEh_cqIOWn9j?qdN4+Op67#V#qjZ{)b7@>|Qi`a#Ezkc0+wLXQSKz2r;S9Cre
z32E2thbro&mC<oMFVYI*GS6v;`~Cb>7ONgpLBwRIcrwY;#u9#_<l7?_n5{fCn69qR
z>{m>|KriWzhac?-6q^}{q#etyyj!<ck8hu-r)?kM#ee;@_^`(!6!AurSmY|8fwTPp
zZk(v(s72&zy0W~6`VPu)s%S(>djJC;GXQb7WC=`r&coi?z6Kk)d&=Huq!f6M{-v~b
zhKZ$Dj-JcVW>}dELaC0KH?xtEJHsJ40$32(ZxMg9*KfQVUYa-z@o!Sl#z5f6)=li{
z34-HgvyQ7ea+nI_1^My;m-fC+{=?$3*Qi_5V3wH9v=$OA>-Qt#zxZz_B2HnifG(j5
zANaXs{d5KvCbF~4pd|Mw+399||Fh{t4%os{&H4-==?&}U9CicPt0^Oi1HbWx#K03g
zyP99#bHw%amUWia2y-$hwGjitl7$n##wDrhW7rOW*NERucJ>bAzhlaPj{2}RVO4hv
zV_tfjZqq}pynlotoMG8n1$X9j92jeoD>>B!AqBz6{vE6luC)1<$t}$I8fiRE5GJ~{
ziWqiVqbHFRM@!=LpL+I1mi{9HFghVO;!|UJU}0sPgRv`xf4{PS*^&4c78RD2KX1Mk
zSP)}?;8;A2cC;b%44W)i%{3n_Tr9evnZR7x$mk+F|DBaC6v=!SudrZZgjPjh1<?vY
zxLi+#_m`6Cu=YV=mne;o4Q@%PVcrHR0=A}C_NVp4hy6rHL2_G(_6&S)Q`%rKAx62L
z#WDj*|Hk!udd_MZ+7HOW3nXFr^WW%1UFy|m<S@&pH+p%ykwli*8tqi3Cqc!@JPA{)
zm82FTOjH0-Rixg8U+R@SvIMR5<<)7Y&V~wW=BzGS&rjCNCp#<BEa$#?rJb_n(EJf`
zi}($r`oW~}N1GLzAfSv>EsPkK#aA|;a)ZXcqCW|oeg*sg&ah{tMH?LSayUORJxr4Q
z8w*ida-1tsODn(L7(6n-YgML^`o1~MLMqu7cU!~R%J@`}FE(S?Y^{QW8L4x5lSTXc
z&M(^g*N7*}*RLEn*HoO*h~?;MN#B3qNXqo33F<?5qvdZy$1k8zaL%t_Vm*09Z+nhm
zrWDrQI8G&wa8Fck*BNX<%S=HB!p{1>Yb!m7l*%dS2auUC{%nIe)f)MbPFeozQv=H%
zu#>Qc+sVS{TULWjY5-S2a6`P28xWaZtsTHjbiBe|maw$WMF`pbSf@Y*GTS3KMEpT(
zx#Sg+Y@-oo#5o3O)6q`<U~K`zx|3F=Q9*_ls$`%Ia7oIq{F65#og;oQ4e&tp+4Emt
zY>vaQ&Or1Cf^WODcYDSRCd?x-fWqEtDblg9wwLe3rU(9h?f~7hABi=Km`v69E4whP
z?_X!2<e9y=D<52%N#37)wi-r380>*;e~WmG_(7w<29YL-XJ^t`fvt`^y0FF@0eDGm
zeuV`Ma{ihT4`+%YIgJ`bmPeaeRj^u(jS-YgD{o|)S+E6f{~7To^9;ix{<xXGYX9CM
z{^0HGBkQ7`nBb<qt*5cb9!f)RCkQ+3oB+r^8%HFV+fcrJO&@zQb<D8NU}l??Ir3WS
z7ZQ?3>&O$x+yFo$P+Eqh+$F3>bb|t%Fdmh`Ug5tUxIoJWJ$y8cX0o{1;`BW&>rAMT
zte={#hKO%8pB=4*uOOcH>pl(@zK<s09m#qJc41@`=pXPCf0_MXzb38cBfe(gs*aR$
zI^5%J?SPkiT7<t}ZCHcED=KmA)_b4_IPh0bc#^9<dSFS1O%OZqw*xY`G)y@V^?Dmq
zll(+&7Juc%6U(_datv<By?<l~ZY2NOL)%2eCvM@LXMB(NSH#Pv16}s#zann9p(<^V
zQbH4%kf^}>94Yp+kj5`omDyWrx^eN2J`7b8gWRdJ7#zIded7SAWY$OgK@6VkR+pWB
z$$FUz^Ozp-o?&4M^r$mJo@D=JKaCZe1N+mVTfYCcDPs-7_0a&DG&4*0<g(#!9E}Yn
z?=IpwMVjJeW<yU&7n|E}XN1d+eLfhaT|^wIBy5Nf76klYB$tW-%37`w-?R65%oz61
zh`$kf$Mm(d4fmylFk$g(uA_dL9VYk_qZnA={bu^HPYLjwA-3RZEfbx-+8*EcPvx@L
zh`@UeWB?5(I?upFq0HbeZO@UzKTu!Ypg;5zuM8Whh!PWlc`>*fH*)g(cj#BKR0|>U
zi1VvS!nMpfg&X!Bq=r9SEbqzq*9`&bUw^l-H4GlQFg5zsCOYR1xa0-~0NxQCsUFsm
zh52PY5Ku^Mp}s$}R~iUWlZ3?POW}h{?daR9n5=om821JOPuWBmNo7<76yz$YolFP-
z=k@MSNVn8#;~u+0!20rUWMmWkV#7dHK|Y<Z67#Gw2iTa}p|wupv&r*nGLUYa!HqV`
zu;I#KO1SSSDOYaZ|1y!MCTKlI{5_ouwfxYZ5#KiFk=@8p{~1w7{J*WL)Dod9Yw@@*
zff`gMfQlq_Ue-QZGxbIpX0Vs>wV2?511qUhjcWuuK((FOg$5D#jHs+nCv#L80=(MT
zoxo8(^0hPYqRqO1GAr!KPIs(JX^5vn>~zewP%5hLM(3(wPZZ!xFv*L^uEH6t8T`tE
z%hha1u)<VWpLnU7${0H%?`jf)=L|u60BY%mXl)kOk(-y+AyF~i@ZJxY_kokECHT&B
zZJ-C8!bgO@4dZw&DfPeGe5-8Wb{4^6ZR6Ok_J)O)=gM25Mj~5qP6{^vi5q!A3iH{W
z-#q~AfLT|}<pI)3%Ipg#DR;mF(d{d>UzL)i80#&&%sYu?w?UX1&5emsiOuq>R$tir
z$@)eI?D>Rw+}4dC%78jt^oe?xdfYmr+V&X&cXD|)m^b2&mL+^Z%e3)XAmgu0WWHEi
zLTTo)GfbTX(81;fTv@XRQl^Sjph){f|N3;Wa;_x1;e#%8HawV!IsXU*Nx~v8cP1(>
zN}Y{X4s{_NBL0Z@9<iefYoE?9dbgME-q$}^d8vOVj$sLBC?axPXJ8J13d^F;X+1@u
zc4L~sjLP%~`KF#23A%5f=?(Zk({w75#ZUY<6SX8oJpZ(i#{H@fKt&!2RFcnx<-9X-
z&1oA+x5u<0=AOZoHaw{ZG#xCGmbm^8Z(avU&70be+SW|PoLmKSCH<%^bd@x23?@?T
zfQo;8zqIz1GY?h*J!aOTViwcO8bByh+>vP)_Vx+oeus1%Q?mae#svJ;obb-SoXCT6
zC9vAPD~l{@THgWVI{Q0!^SC>pnl-2`Z4E2XOA?rKsI$cosn-@Od7_BbTA<21fp-|)
z2<oahz-PM2A0n<1e*ho14QNIQS>w-YJJmbJ3eMRcN?rbL*5{68CcYmc-m)T+IPBMm
ziQOA<<3D*-;AjGmY14oXWE!tz`;vp)tW9S$Klj)5dl2XaE0>CCHtKoUJiw~%sqWEI
zau?mLI(Qf{ne2Z@=$X;G-;7(818O&gL?tzhuOutWdHxCHYk;qV$te=!D=8e_tf60l
zIZ|!$wgZ+<Y=Kg*5kCPFXKOF6)=a!A$g_3*UyRIcP`XK$kiZxvHLnprB8rHAM1*nM
zg$Y4EQ~|XHjXNA=W2bl|NF<{OW{&zN_=kzI>^XfsLRcP<d>Wopo=ss?0l4JKA9iv#
zCN^lO#^3N91J{3B4+843JZXUj__G~|MT|_b7ICy_gRa!BJw$e>f|p9vZ4DrTa5ZuA
zZ#LRVCF&i;;n+-qJlWFET`U7GHVc?eA<ZUx9Bn?#BFfa$tpb3AT07cJe;`1r38gKu
zQc~)?2v;C<oDJ*<J1raVU@s_2O!tAz=O;|sfnKCH29v!-{A=@t2oBTg6ohFHJ*1Uc
z1f2;q@78wJFqNz`0b^^`yad5sc-=v7)20is&kuG*ija*@fdf9{t_rK2DXqF>La3R=
zI5F*bB8;>|)i)Y}W<d1a0crhr>4TL9Vl+PD_E9JEg%?I0EOTzm(M%jHF@^U}WeuP;
z9r*9WGxfsk*x+%EK`NVktA6{L&@rJwylgIhVJ7q*@lW>o$Q{i%*1;w<-2>>tZGDUQ
z8)0Ndu+kfbkQ>WYOoERS{VGzhFNK+38Db&=f7^H(l7M%sdUk|_8+mY<U3{6N@*7_s
zNPjOZ^`i+3H);UOzg;<4qv7?Ji2sVXL_9?NUx3678S#kfqbG@YAl3-+D8;?3?e~D&
zSq$ky5nXl1hQRX45N@(~t+|jrG=5|KYfA=vS_7cM#y$3&+}XriwMlc{)~}N*?NpVv
z&I>85*%ckxc;3iX7(KkVVpwoflhp?I9s}-0VDY}{dp+BYmSpbxwf78@;HYI_W}cIh
z$HCUQT;IJbT^k$OR}pl6LP2d*aAyqtg(O6(tdZk7879q@h%Q`S0(Wa!@Mw^2pX%ZM
zy8oJ8>Ubl0gC|>>%6=zDE;d=LDb3=R+k3Sxvf30G#ikD~SeU`ELjkP$r?rXYGSmWJ
zHyVMybQ3JB{uJ>`!~u)Qy|<QdsqZ{+;-qJt*wmd}CMg1|387A<%uPzI73^q&zza;E
zp>rz$Q4Qzjm1o{<iU?+lQawXVzF^v_1TCb@&+N5CGr>)BASzb$jd0=MOe*#Jrw5vS
ziFhVxbS&78v+$iV#D<POxJfnoPRK~lHYcyP+DNkhpLr=;SV4LD_m_wQV*O-l@+5=4
z+PHZYffcA1Thx=6>-E3!Y$yDt)@(9=CXO|cQ3~h;>p!9aJur}>u&UxqCgmS(f^TiL
z<HAy&%GA~yjckhV6^BegYfpF@%oimmSNAM=PHh@|4WYPl{hKF2g=Js?+gzFnNyZy%
z{_-9GZuUm3(%z{3&hmUm89!vtAX=|A(PzhA_x$f0yEkN6_-rY7%@N%brVA=?7g`0s
zeDqEa^^d9RxnfBcI^hbn;nI}9$g3|AzeW^r-7R;tH@r4*|9K~BNfGPHrhCWUVq1$&
z>y{mT-TWoukBC=$osp^<XK<i2^?UbWGJujOd0>SGp!68=$+TFxl1=>Rl^-tb^%uL8
z3EoiA=oSvppq1rYi%WgB!S07_6?6bbyOe}5QY6&|43hM1pzGqnTDY|T5{Nx~G)yq;
zpf~pJpB8FO?F)s;>Jt#X!-vkKNeBAB4lG-VF6PxUxCt5c$g%5)yPpx2lu#(oCW=ee
zUy92<ZGpVm8Iw9JDMHTHNY{`%zC_$?{+6C!N~yepsaA*rA3l;3{9;<9?HXPH^d#CG
zkR{Y`5)+l#dF?+YYZw(|7w*_|z-~<T+^Ap=#=<OWgePKMWo?Q-Ttk`t#ANMc?iUU=
z`k?<PgC9(;Te4vfhSz(XewjsIxdfFCZg!~8G#Sh1RofXr4>z#mX7pK%_IK(Sb4EiJ
znETNts=gBg1Og<(iYx^#Sc3^1DS*-28;xd6t=&Ap0l!(ovnbn<e0gaBxk9fe4p7>r
z#%yqJTdG3c#z#V+7MMKZaSFWm5o%+%iGVuD&0kpLNA_a0qtmW?xe6XtMDsfh3CsGu
zM(iSfg_n5Xza|vcOx(VjAYd2q6^oHh@<Sc8n4_AYFH4j-zOi}e$s<e@Q3vxUy`A_P
z*H^AvQ3mM2G2-vw-J2!FpZJ&S`pyHxlHg_ntSRCh@jh4dA{#288;)iKv=nUEo{jt#
z@r`TT&rfAZLB}w@*2u>KVDuXkEApqP@G{Gne}PN78Tr^)pKIbkuT03fP|O-(Ya8n(
zscoXuE(?H?r)a504n;{qd9+ZI*T7t!wEU%|FRZeeoR1KkNfIGmVFyGwnxNxs8`SU<
zc?LLGQrf{eANfy{#j|nKenbMaHu`b0(=vnJTH;Z%1o~~2D+`QgPi!lxjY*z-zqnhG
zf>{d%Cl>k9(&fKC_gtU^puE~JE<M#EKi=7$<Q(8^L3Gcx#K)EdvA4}W@;Sv=^8UR6
z&ma=Cn;^ObKHLaBC%d(YF+V3XGD!?sB=dhnlo4MzI${y@krJ)S4v<pF8U!){MGJ~+
zdt<b=ImCnEg#vdsL%S(8%n4v22<7QFSp9*5a)%yvN1R9yNSKnhFu7z&JRKo8PMZNh
z89)VIZlHAKx+k`C&ekvRPZv@vRgc1=N<~zeEGpjZN2NW%jneo#X{oyG^!O@t^eD`B
z^bfXd1*Py@hB={bn9$Kn&N5++6aT0%l4oEY(4_|rH@F93w<GyUvH1uC(BT2)T9sC9
zOJmgTU+eANMKzX_DWYbMbFmKKldaa#&dia~k*_RXzd1`sMf{0l{}S=9hygm~%3!ta
z=a)k8Ms|5jr5F>pGp!2%GtmAU&A1ChdE$MJBYW9K{1WlU=Hgf6z6DPa#+Kg<$$mnc
zxxu2B2DfxBYb?sOWJw0B8p>$7U&(IujB#kJk@K(TzZZd95)L;>QO~8l7IR!HK?`ui
z6&JmD-wQpc4|Z4cx*1jmR9pCy?rws;oeijO$Ke9P;fXu!co50Es<i`{N&9{UsoWT^
z^n|eM#+T>JfC^yBBe!?9a(+P!ZLATi{OINd4V3l`)=g2^N+q9xNmT!c_?2Va^6+~U
z9i5FyN;pK-SeMA!*fh+A0{N2CSsOZf!H*v(O7>XnVATuRV{!l&1h0m-0drRwVL;R{
zY?gsaaOoo_{s`V(q<hPwEoIr;NA{yCK@c<t0-KzsccLaZO5%-2lkF(|Tdua`VK@BD
zedlYKhXyaxTVI}P!d|Q1_!KrKV`tp*U@E+hn@{dveUdF;Q8;ooCF^8k4H62tXYAFJ
zq4OfGMyms3!+63<>0$v}Y7+%=!cVB|BSX;g867NCVNwK<5IK3|##<4h4#xE>+1Me&
zY*?=i7Ouo#dlsp&cyKY?oY#O#UZ%1$-#VB-uR&}NR)E?l!Y@(sD=c!!>thD$`!Crv
z(t7}?jhiTW+#{&#geNIrEAKZ@SlU0?Kg&XQG<7`Kq~B|X;70U64F*u+`V|<Y5ZkW6
zTzG%<0o~z1YPTo3Ig>)>yH`g%Mf^v^KTJb9@`V}?yRdTa>uD5)l`k^_e#Q-q(EA$*
zvM>(0GT=}XvIED|eI$c>iTGi?f_kna%Q0f-uj?dgwyyCTkXam}Y;dGa>wO)AJTgr{
zjxbmvd7vE7{Ui9(q)J5bckg7>14^q64q9MoG}pCk%FoTqZdyTMLOT@(#07N*qLQOu
za|9ZeYJ^nT;fW6n6U;NfiXxtZo+KA7p|e{v2kTQXW9rg9%8e}2z9VnE$yIP-K?1g^
z$080q&dfud*Eb^3<!oISs^ixYpCe8YKiB-7K5r5KE8;gg;Xk1(TbynSs(4!;VKE3d
z{zT1>gY|~Hq<YK%kgAU@DMM#9Bz>U+N#(ns;@Ctmo$aj-1xbX|_=*2)?bVmQBiEJe
zXl9;clRf_};#<TkQD%gB4Exa=-9Y!8@NS8URPwu}nQqD7?y|QL1eeedf{O50z`k}?
z)0I3d`Tof4yGLFete%&Sa{jz7Kf41s(;)D~ru}3;8$#^N>TjH(We=XM6Sj@`g7$r>
zfh>&}KQkf5jIiK^%{-chQa<cT1M;>Gx`A|E&d`}}E=sUZB*?E3zgu(pUtcb1b(4Z$
z(zS|z5pb;yP2Nw>nYm-5VN%6#IlwaopxpKJl>DA>CKnu7Z<9$c&?HF!sw18u8H4Aa
zFlLoi14YkmGXTlvl#L$n=if-`AB|c|KGIO`P4=}j!mdL}D{5q;4krwx>Pt^ZdVm+L
zGWZt8Ar}Of0yL-!cyB86Z3KNK{R*6(vdKQ;Tg1P3?{l6+RD{kqIuZ+Ta7|7%(hK@M
zD`E$<(PwHM1MWa<vDne_qX00ow<o5WJ@eCYpo;2={fL4+!?V<^^Np9nJrF|`<dQ8A
ze>~avkdlq)cr#E#0Gu{&{ylwD&^x`wM}W<2;{1&~q%_*Hq4atnlHH7g%PIJgNu^Jy
zMmi)rlcj8x(aC8BFq?d}<cJ6_xvaNX=ik8(YOe;;S($ebHFs+rt%{fjeDLB1dK)Hw
zH95v@^Yrf1mxy+BfS+*2k6bD5;*dT|KRjADcSYgvLTu^QDySPoXT|eRHW#<#Zx@@c
zN2?$ld4@Twm<?5NjII<p5(Ks|i(m2n%SSOlZS6f|WS+sOy1lb5Nx|#_+g>2SFRb*&
z{j2r%-R9vf(v%+&Kh3@5KBzQDt0I1+LkMci(UVTpmMJm^vi0u~KLERD#DF#F>o$hy
z2|8l$MaGWA@ZP2sZdg&r-iX4H{e?C79M?(U#IOEA7gWK9s}3x2!bS|}VY@yN17{l|
z+4CBXM|(|QviOZL>%A2|d#hk7{mLu&1=;0<CoW->8!%K)^>4TC0|@w+Urc%PXk*;o
zO$b=8UC_WB&=%3>a*AIgzQbRNV9m>jvY(n7LSwb2Jqs~eZ8ZOg#Q2V#J_6Tjli=)4
zmY)}aJL_@j`Inm&GtMTmAKZ~hf^?)0<xdP&wCg+lNgL4X?!#UHfsIK8dZRHrBh&H}
zEO2^P-huat@tFk*)&gg^LBTZqZ*bp(+1eGaQm+Zj0Qf!N_bvppDZPgC56Sm6ioy$u
z0gE$|A3WH=w{0<Jk;Z@>*+9kay=1Rb^GB$N!~qk2riEsYKbQ#ONd{0uAxuUc_XdR~
zyR{7+a!qyjXcl|nCuTComW*^`o|o8$gLSS2d;Ug4zW2!8m#s5_`wWq{x0-s`t0qn+
z3j6C*5$kt6yc0f4ar`q!YP$yH&6E?5qoZE}rLQJyxUZkIhRfzD@d_NU$jcc#mDtaO
z5a$~4J>n}C)~7F!kg$3~2cRPC$#RUHZReih9`R8T^O(L-{6GO=#v`b#f3Rrr*J9{+
z^^CC(maW7Ye@n$K`Dbxpl5tn`p8SaTFN8$u4$Tjb9^pWUE^!KqczgaZG6$@piL*U>
zV$H>7gq=Q;p$D1fk~RFNA^qB_F9jYo*}~aWa0gRSDT(jwO@sCN;=RTCz!4~w5H+`4
zGnvD4uK_G)a>2o0O$5FtJL&<B!y1GLQJif6lqBz^(b(l?D|KO=bW%ZFQFia}8&4dD
z?M}$bI)83pzf(FoOyFS3BB&@%#VRk^evSA-JZ!kBQdh3Y%EVudtf+Qkfe8OQ;(u*&
z=mj=(u_UaBcp&lUIpJ-lViNY8ZNY1zf~vkUmC{bd{Qw+MefzI8NY4f|4wua$q{<NQ
zRL%Eb>Kae<DWYV&eqKaX09KN}=OJfE=ic&|9Kq_Bu6BSAq%6<;S4j<X2c#ra)w!NS
zd!aabqN#2)DZ*c$%ACHJ9%_DCf8H@ta9ppSQZ`lMYAi}!I){(Er<9N#+4wHKi!kxX
z#>1<FLt@Cn{GMtPztCNjQ%I6wDpo26*0`2eTci9lqm@3g9NK(U?e!EJ55EV*kNm%)
zt^MEUwS-BJa5O65$zG-@CcfgBY^ac0qEAWs)Iw!UK+1_KcadR`<xyXG_tf5|VI0gB
z#<aq8U%8R2`36u|J>m)1dLYOob?~Gm>_or$^2BcuO~iljcIAt)013F311<2IEiZU)
zjW-h}-bRouOS}Y!MGgJm5&vNWs&~xW*-qgDqgS<FvLhwmCFY&Ppn{uxL&(yYWL>hS
zfm4xlcr{?LrS>WNSdiTWxUMn{X10nw@loAs4Gev0q)eWEb);{fBEnjAN~`yMCr6I;
zFun<=p%yw-b0;Fk(ZIk4|0573!bAtXVo_!rk=SM5FZfz*qFHH!kHo{v<~Z^hNA(`@
zN5o7q<BidMx|e1%cMA&ulCYFG!@_#IFW}fA0}wMn=Q~mj<y#f26E=})<Njw;s#li2
zRe1Fd6W)G=nbzYb;@5}(29@^M!ZvvZJy^0)qX0SJjAyd02XfktBs|#dBaCd#jy?ii
zBI192RL1Djs_zBRR7ZRzx!iAFTq>aJN<iE2*plAvvw4mxV9or|iF%qc?SoAa-d_AR
z;&a5`+2@f$iArw`OtBnnvUszlY{LEza5g)NsU;wFLxd7E@&?=9vJ0+>NxE3?0YG?j
zuvu*b$?JgW9QffJfem!gjFcQ}9&A8Tc|^y1+zi6AX-1ZZQ?OQc{b2<q%+Wt^$`nnn
zw!HZPEC|+DieTcPE53iR(X_EP{Sw*2!MYY22&*#)Vj(#{uMt$R&Lf4ZcZ8)ATBH+Y
zz3-d7G>gUkY0U;LUmk#{#pWy$dF+wMq#R3VO4w`KZ01xEdtu>TS-%<Dp@o8wEhQ=p
zHE`a*p=umhzVpPs^hAgxfQh<3v+Wa!Phr_qZBLOqAXS29P|nT<$;@`O%NoIp(-m#D
zbP?Sc93c!NCO5f-(UQ^!d*h`xqPF(h^LiNK%*OtVxJ7*8*eK9GSa+SO9rpm%Mf@4@
zzazdNpq>mV%2Ya=teGTXBbBKN7keOpUKnxFaYwlZTn6_R@y#Y4zS>r)wzuD;z(imC
zNM{Jt{Lov(XC_To>m;_P%2nG0m&q(bjO6fd>hSWo1v8r@3LMOeuGveuB2~f7b_!*e
zJ>Km@711+5y+Y|wT0++0RNk3qmllDQ0`KSoQb$lj;j6Vyt9$_}sDc;5d}03#3?r+!
z+e20dR+Jg&q&wLpy56vJwb=+Zb25GawrAp{RB<zld|tzUA=n+jr$fnDlN46?u<sc2
z{Zk|IsMxE@hT?pMOA}eN@tMdRchr79*{huzo_sQK{r&!TM(#?fT&8^P=ZKa;m$?HH
zg1(ufxY<t^l(@bVfHr8@(&*j`6_bTAZ<*7kBr12{1=pMmQ;$?19@z7ljYy!&fJUHX
z98m!tS+e*)BlZ!$@T0{?=;M`NIiZ%R3|LI;^z*tCwoO}{XO7+lOkKimmMn1MI(9hC
z6Fx*uAQcyw=~vnq2Q=Yjq=q>2Ce4DypSX)e$R+-_H|FeWyQPSACB;Zsgc-I>{cBI=
z{m;Pv-eP03UjHQ~Jy`4P4D{J?sGsmI2>=u**LWYb6D~$_fm(UO%&u80N&~o>Sc_%N
zmoB!jRH*ytjd&9ayE6<};2M>_seqAVP`SFBRlihtzCnamobjXvwL?Brrrn0prUw4w
znhoMgC?L)G7@6Gj2z4}ICpX4~BzIr&<zp(>Z18xW`RU3C(;_>17q)S<S8L4{3NQAi
z@r_T1&SYnu7j^os1aVotBOu{oo#eu*eitlkv@9w)fX)F=tn$GU5*5Cbziv|)f!?{c
z=9_`XnYr0Ke-tsa2Pt-@<eUlS1!E1W>?d5jG1>Aw+DP{y;?Iv>NUK#;>*38_<DkA8
zbs;55geb(yrdd>^nLA4*mtxTOEQ?*X^v{S(#IN>R<Rb$mUpRfYbr+gGBMhZ_FV<&m
zkvIMp@ny3aTKb_L3$0)dT0(;QG;;v8<I9zKDgD2bx#VGU{g*#_q?Fr1XYK%wh0!5N
zBVYhmz8Lc%3++=qvrB`L^WSiI1EFteRDcfHX<b3qEu4^HhxPwSjsI*W9P9OK@fjrt
zW+Wc#M2-B`#?G}I@0P?bY<i(LI&FU>B9L!-wn(B*@a1<uXEit(&yq)dwHf~EsR)RQ
z7vpIE`jn}5W3sLTC3_WlV4`5T2Y|`OUM!L7*|s5gu$PE`Wv`SA3lWWB+|hv3Gg~)8
zHBM^>e1RUBpx#Hqzyx*t@^27m&5loysXG((xI}&f0cQM%v<Ppe%Id74Aa7<Bf>(*b
zyTJ!aiStKAaKY!nyk9}use^U3snhrM6H%=i+e8*CHF9OqcU}j8D=#cN801s{nLuX0
z^sh|EU3v7*9znLYH!IL`biC5>cczBlzuDM8)!XV)Dj*e`jX0A5T-7ZG>t^|iAzjw+
zFW9r29m9gH5*Ju-Jd4feSM7Sn<Mq^wrRdp{gzd9DurOk^r&hDWpA2w$1)-u^Fa9Co
zE1m2?6U3e`-*}3!W17@}N>~H|6;U;9`UaKvwTv;i^9%R?$a@<lTe@LI+r}fh`etI>
zlSM*#t6~#FwKP%|z`Y}$Ba9cSZ1(wf1}Z^En08;%(T>H-H-E7H!`hlH8e7rP3c_`E
zec5>r1r}sVEzot~PsZ%|l~K-CC@5h~I;hpwN?oZ6Xhy^I2Z>l#&>QU5PuWQ6iPG4s
zy)3q8C8gtD0O2L#zaoBjA6{6kps-%D7ict5UrW?{+T_q@4#!duJrL6uyL-Z=ED`w8
zK}Z2ccs9CNCRW)@z5ytkr@+c!j-+GKi;KKcUbtVUz%vB4Xe(6^XRDoF$-mq6F)9tR
z7Oa0~?q=jk$Ia+9;;SnF(F2}jAZQ%z<+>XIIIMq4g_^yI!p;PPaeecdx_gOa(lsof
zk$%5hc6yc?*-|n236A?2@k&AVGl1&6DZ^FJ%;MWf`lqoLXA^D5`X^*_J&zzF<%+_u
zT2rVKv7%&D@h6i+&xrXt?q$1^{0V?Pv!XX!{0_4b$h+UUL>#64iGWgV^NW<$B~RXQ
z+dHh_UlC2j7X#j(txtF3BUjTi1hfIrKOhh)4QeIpH;{EmdMqv{%(e7u{h7Wm*;C8T
zm-xH}zaU7vA%vS!wxv%0)0L@g2h+?Z_TRwM1_dN(B4!AIo5=<8^?x&-UV_a%jB*f(
z@;-)p02tuPoX!|n9<>i2>6mY_8+c_9(8SfHSgbLomiEY>nK6{4T_a-K)?B}&2@SQz
zypg1!Om<+3Y)|rbwfe}-`fYL>P_NzufmzzKG!ST&sR-_S*c(g{#Vk#D%`tdmAVeO(
zCSDrw({x`2RL21XKN5idMmqf0!xtu_4FJW2Gd@`>fV4zaw)iI_{NBi@YgXZkz~GBt
zIs1qRD9y|5DWeoPxq>*EBd`e<)4{d6@@&t)v{?ot@285@#Lq2I@Q|MB-5SA##ee}E
zt4Ju^@Xi+I8N>(y+(BWT(t#}%%-fuH9NhO`=tn}&4*QWx;j+m-`$A|(MZOg_pHnD3
z8O2f<?HMV+6cJi;a?YD=*c}5(o(PoRBhEmlgC$g|L+3thV7I%7f5AwV9Jrkgy3A?#
zneX2i`=lP|EAILRj-MHqn|!~AIgw1{9a9|$U<aF2zCj^Y9Cg*IM$-<wQv-cS5yoE&
zu0l_pD*bx`_|A&J#oC38wCJnrM6^-K@9xO}8V_LiBJd8{M-<Uy&(LC_b=HO!#-X+L
zLfhE{XnTmHiA?Q~`H+gzS_^7!Y$oSqzD4xj#s=(Y2ECclO8?Tqas0?ef(v`N>_Y6j
z7=QW6&z33w^=Y{B(v2USz7|+ApyS@wFL@E#8RLVkc4<8MK0|N<MGE#R9mp}rZiRC0
z-!BngJQzu$?p1X|?2QY^Gnfu;FU*n~6NFz`rhZK!R7TQOc2f=Ce77!`W{ra+bZpJO
zmC$umB|1>nZu9<MQG0bntdk9?7jsve+qr*#L^Bc~k;x!q4KH-w>;v$|0V!U~v@))3
z+oqvaSsBcw>@AGxKRsp5z=cUG2C{|=oYeu&sbgHo`??Gz(W>&GrSA2U+`q8p7}t*=
zuDnCAcd){L2?aJoXZ2YCtH1}ZR4)r~-;PMEB;ZUbJCETVq4P?);(=1IcNTo8f;(D>
z2$oa&!(UmZ7yM7M|CMnUdvtC)%&tr+f=4%DFCbjO=bEW9vYZFD`QQM$_BA}vc2Z$a
z1OB8nee`9mBG^N~HJ0A*0gHnv1qPy2LSqVVC_|pux_7Ic1WfD(f)_4pcrf=N%cj2q
zbRRqmE7C5Y*`il#?b8w{71lU*-~Hvz-y{BI!{DAd=$D8e5hH2Ih5;?O{wsL-6+qtc
z65|_CU<oqUECyFbR9=|~J(&`0LP)ToBNxTGgr1WLFy(P)NTp#D!7X*z7aH$p>|VLY
z-bSmuX7{f?go@@X(OG+VkqO`OWae;xeu>^>a66p&D^GsIBJ%)N*s6$+Z4G{Vc_z=_
z1OKqc_`U5=eT@M<e*S;-;k|K=SCF0b-P_HevweRwkFRWO`LvwzL`5Z;#k-*t@q!bN
zeZT}03PicnXia(z@ldDZgt}<EfTG&@lfBtwhsW=%|8B?vkO0tfet#0iPe7`PY_P)?
zll?D%n@=p&8+4YQr8O{d+4Mh^F`vqyB(Huznzl%@zOQvOG9-n_I>p_yIkm{HFHAU+
zY>MhFNy1wrkMEy)y@oY@U+eBTV6+gIFF;ALa*s<pkWLNU&l^(W+yM%2FRT9mpZ}ib
z#oLWMN+CP$e`1N3n%ISTPST(y*O&LGtO!)1$}6puGT18(?icTw@emx&3;TX0a~pxh
z`}F}XQqOXR!`~aLDr5W?b@(Y;g4ryw;0!ga*<jcusbL5KmIfIj;)OS*?FgWIYuMfR
zEUk-Q*f}K*0rsD9#*z6DNl&ksfMz;8U<*>AJZ{d@6SI1SD<8lEd*ac+OOG}X=n-v(
zg837vl*H|ytS_j_?!SZHlX&HU#h<JK80$Ld#OvUGqTp}L<_hZ$nl{g{9--inb>Q_^
zGW#R<<*MS1^_v-b_F(dZ(oRJ2(Unyit>ID+M&z~F*kExw`A9;x^bEdYssjgEx|r|0
z7W{sMJ6?VHDdL$U;7GSb1vayn35zW6J%Dd#j@_Bcu+BxbdH)_(dcJw3Znm6)5oukL
zgw94G@7VdDM6Vq_ciHocbw?%?(AB0d)^=0s<C(46dSAJu2<(o!VgbZNDxJ>}M5KDt
z+1|xmV;k?**f(%*_q(iFs9VIJ+~NWs9+->MTcg5;jI6{4TLZg0Kv0-7*w6uwu-q3i
zAoWk&j3I0cUnlwhTg3l2;&a4*+S&~4SP%9%5=l2W=!xpZBji_wCU{Ti5lxhz-ob{{
zOhz8nW`L8uFZ^{)CQsbttCd2<s4NQG_f+SWSpSYL(IejX-5gbE15PfRO2FA>H>!x;
zu+Hj&3M?wGL(cXT0oC`$`v@mAr0*ZOF_~mZ5l$|#!bWTAVad(Uz=Bt+11onq@#S%I
z1l01mK}>$K`Ogyxbw}cK#!nRMN01Ojf|n6bb|Ad>Ru;~`JepAO3<V>$bzqRy+q!}u
zaHA^2%$9+@z^9l0MuQth)hfy@3;7Ze7?nt|$ku<PviN_Q8eX*1U=KXmEBlh2A0R0Y
zC>?5iZBOpMBYJGN^2lJ6$1S}7Cw}olX7QMr&;V``U+^ArjQABv`^b~~D3LpbJ(ze(
zU62y(O5}Z7>!<EdqGrC@a#l$@I#Ve(YY-xONJvd$&<b9`t9f(w0Fy-z)ms*K^JwaT
z?R5`B(DEKV2k=ia;@PHMJ#5C1u%d&u{>-1h+GLAdMKLhAV#ga8)QqRz&?m;>)SCym
z0AuYVo^aGZBd$mX-)$J(Q>MibE49a67xba;EN!mX-9*JQgI5*@_9NmGvt(-=^Tk>$
zn#~NRL4WXSvyfD^>M5l!$MZ`@!^?Y>U=;P8OT)Oshm4z@q{2lqxUe$F4T4ksR{2Mr
zw1d5va57#<IaOpbydrG`yzolSg1c36Rc@+i3Mvyz+_~C8XqW<2TUB7=BOh?HCD>b9
zs|#LitnU5%UFq&=Nb`byUe@o8Zi?9m`{dYklvu*1v5f4Q^dbEwg;E%5Kd^JNr6NVd
zYeaCsej_ta>`&d>lQCx{iy|Gzuzo50OmIBf8+=EOqj;eKd|hq^;2Bf<hL=}1r(kk}
zW=_5-VVN?3C1PlTMQ_#I9>0*(&>j$IOf2}pYX2ATSbp308o=JPk}aku1@B~eX<`4|
z4M4Ti$^-{zmTSk|ABd*O8eADwbFw<LdRbZ&ZY9O;4JaiVaz@SG@Cv92di_9Nnsg$o
z0I?@DOAAyJu3B8Ov+93+mSg>oh`;0U1-X`G`%L1pBQkFjxM6~anVjGU!LT=hOr|rt
zOAsX4o-m^V-*BOTvR}WHeV{-3fu|Lv9Ps|Tk50dqa7OA>L#YbBs>|a@PO@Ewkc@J+
zZiL3d^<ZU)`{@@nob9+B@U2B!)!E=rf*iL;E?EFp%ak*ij0;ST(IW>iJG2sRPj(h^
z70Axy`y+dH0Iu}>)@M@}<)48JQaDzHV@0o-XG~O7VMX8|!Il-4%=bOBuGcjPm(Z~y
z<xw9-jdku!R5Px9;KapG)Jm>!^9z9a7V#&|Nx3Im`0-~7$2%&mXF~GP1`9}Jc@N-)
zO`P0;WIBFVDk6uf+n?FUiUkt%<}GI98j={O^H&-9{LWHI^HX8`Nh4%wf%1jzSgf_j
zk?mNQYqt3J0HC&?K2<6=sY?8U4dp3_#*=i!xc7)ZBHD;A5r>GsakoFS4V>**EtUI$
z<1!d<-;pU@K0<&U8P<_Q*TCDhq;|SevW-11R#8rFWX5SfK*~(n^GC!!;%^cE1J6*v
zg1p(Q-b$0^Y!p#@(8+~6FR_h^)cOb)lQ-ke1Xp=EuN~b8qa9(O$<|-})k}^1hBH|X
zpfdVHnuX^MXf%>)2{ErM!NJ`4HW4tT?zTv)RulGjB|_x^e6;%1h&c+)N$E6g!hh)i
zS{tKRSQCU8?h<>?*I$lWTEEVEwkH1ff;r}mSZC%H?6^)#=&E_w?gP&_Z`#k9l(_-{
z7glj7c<Wq~5#WyIN^2F=a?TbKUX21x5U46^>Xxh1C@F-v+jQW)QS<^DH*;peA5QCG
zXxe=JrJ8ju_;vX*0gyWfFaKBIA(oVT50`quz_0dFt>hySN{b@U`#9uwf}@rEL}_nH
zD8X9q*8F!gRJx}I7JmOTO|pxr{$k3Zoq31c^W1-?Fb!mf$9S-S)L`BciRCi9I<3(z
zuzuluzETCht#8Bm)zLV}ni;EKfd5&)N^Zj=-#h|87Bq9ly&oyWzprJWH&YjnSi%j)
zu;g|+UcT5lf(ZH$p0PDa2*kb)<Zr_oL`W(zMEoFz&6Ls`>;IXdXLAQKrKnXrlL#M&
z(z*3E>41o(WG;W!@b8~ZVY;D$P|Jgqtg3A9E$-wh2H~Pt4xNN(H*Q!=^bA~X`~WYF
zh-b!qIpZ6oCGA?jl5N3c^{FN9oDC0cJhMIvtMEHacxev2V@E2(8F>Jht$SLl*kY6g
ziBo|^4(pqjR8MV7VS=q0c%v9R`o&d#mzY?Bkiv>d(v{`>jRL5%ShdxoTE>vYM<RlW
zku49#u)W#R9QohdI)fDZ-tk0wY&m*~asqFg6j7IjZ8)uO=9fA=;)McQUIv>@;nf%b
zl5$59m%{#2vUv}j>fqj<k;7e$cHVEw(DiJy;XbJjO3P#3vzxyM0N2}h7@(@irA_tH
zfQkD9?Cec?r4g8?^kv7fE<!|li_&X9;57DNngTB9(?^}aeaa1~mHtXbIrDUhm`~of
z1ca1E$rsjJx5HYx><m_X;feFrSV8W>i*}x@O)P(YWnjuY8zTKo9x$U$$$Q~u^L4`E
zLAa?bdJ1SK1Zs;-EmgzsWO|)RCr>=iVY3spN#9x{yLd1ynU~e3EC%%C3xW|k%%Xz?
z-;+dqig=0muZZt9&Z<N;y~nrrxX#OFGw5u^LWM{PXm6-DtMv<E!j4-@rt0m0qvat7
zvbp5@>VqjwtPq@q<q>{Hj1hlE{GZMDNl3<eBWu|~XDzI{vYGs&qCsJORxSQi9JpZM
z!FZ*+@lEVppuR2Hh$Bkg7gGi2H@`E{voKsqxneQ^v;hE10OZAtU+QyF&>i`8E<#v}
z&Ho1uy@YUWtu@QZlX(*iOnA<do!rvOTPm;QM3Ph&6~Ce_EHK-H$rKgR{`w4DSrBI+
zo_AJQxKhcvf06(YwotX@*;3{!J4(EtC(Ji~q9oyz_qlQFs!t^j_(*X15iy~M+p(F#
z6uT_;`vDB1>kUEj6;Jjn%R6G28`rPXal*o6y$L)TnCvQh{+-Et3j9ey`RFH#knJuu
zNm!P`@2|v)$+Vd_g8Rl2h_D(>OVJan3+n^#EYrR}e_@)5n#X>Cy?-@U`Ibe`^&<>8
z5Fwz!BnHKr|0%!<59AIC_i`^%7}myDB}G`@rxRDMsNtO8S>HZFaGl9RYEFT?R}cBI
z$M_Q;R8q(Ngd4k%>|O{gi}LK*NY(rOS0uZ;v~zK^O2ChZH+n6`&HLBjkpjqGLq`=>
z)e)n3U;iDmXJ>HuqbVIaBFXa1s%a<pzc+BD*5e++Y33Sg*~;yw<@G0{QLqQA;Y=K=
z%ep1HFyj-FZS57uM{Z}MfEhgGE4^eTKkCxm6Cn4nzW>79wF<|*8N8%v@(We=eQ}F7
z@X^HPDUa#d>^VbEH>jQmd`7`vQdH`JVTtofN{OY_to3}8LFR5#L!~p=8lN_EdPa^;
zi%Tg|rs%KRIO1X^PRxZwERY4CcwO#!GXhtbwT3;sp}#reUf(xs2;vPJ1P47wLfh*)
z;DpB1o%BrV6S}A|I-Tw6Bls{{=hYs_A+M~&X_t9#H-iEFR?RMoLMYIID?v<DAUZS$
zOkjd%X~2|MY71R#eI$i1QLZ?e?G4t%T1vgt4nJ|vl{u~6no~wvs$a4Iu%q@knjUt<
z^(gsPi^K-}@2J<^_?M9r5%^8nNRw@W5Nu<-y96h_%@KLv;t7HtZRcl_z?)6QQ}^^k
z#NRRrj5dG3QF+yIz$LZ7n{r)8E%5|5MmUdj40otDI<o28dKQu>*b4`_=29v{6%k)-
z8c6;T3=7+W!pQU<vl-T-EY!XulD9A!WMjAE;5cDnO8dfKy5o^!=Sw)QO{la^YEdTd
z*3dD4f#so-^;fe_XZov>cv_^%Y2oKb{7)V6g{A5&Wp-NQA$u9iy$?VH=pI||DMZ)S
zmK}xwlLt_+4s2B&yB8Lqu*zD#fCYZ*S90mm9&{w}$l;&gg35nOq-=RsxwHUQ#JY%m
zX6Z|#jl?<hEYR(vUZ|0tkr9=qgg{lq7o?}-I{%jgS{gZirI)3%dZS82HFQ>jfaHBe
zq>ZfTi6y)t<T$OBXwu;AtO4~62ikIC8sqZv`&W0xBO?3>-+W*pj}~6X>;SfSzZ<O-
zV&yyApdCj&ryzQK;AA45y}hgVA>u?Bi%l8gkr%hTTcx?Q`2<HG@_BQf-fSX(uw-X%
z0<#*&o&;dAiW842_!ztyMUvj1AH9=SkgwpJyhQ}b^?)HvYYI&2gfU`==(8~gy2r8J
zHXmN8<wVd|A5otc{x6w;(QjeIGfi9RM-WeTYxvv%(RzMj<VHqAm8t>cO3pIkG2(v!
z6wkIos{TxyQ;_^8lmuuMP8@B0ZD-Y*{P~rEAE2F8+@7uQuEUcAz{X1U5}yU0vo}d-
z0zmH>KyU3)1-rL6nkN&mEl-mBuo54#X8|i~$z2@C((3rYIZZ&g7}tW|F7bP+UCt!c
zA1dK=5-Vr}J@n09s$6C%c;LN%FjMkq);}AFZ}Ag(L&9W^dN#>(0edNMC<1;P!H11B
zA8Dbftv=L~$0SAUp6b+@&~e7;96)9l+{k10LKDHCAlp39`0&O!k~d_u^QHrtZJhFn
zh%aaOi3I;>1HY2*kKmn0?5rSYG<eJso!&1Izh>j4CgY4QWDO^}2o<sSV9E^;Du6H1
z-x1~54oPvrIPTxGPE;Y_D>*`GQ~e&WnG)anGZPUg_ZCBWqj#a>4W*wk`winSn1{Es
zl(8l2Qw6%`bbj|<MmaYTHa{)mw^oo=qMxtd1clNekob)xsDqBFQ96AX04Dg?lL1hv
zYFqwr&WgCR`RX^s%mt^tHSyoDi9S^*_!_aN(Z8c~S=kv2gQOdu(tJjRN1P$>N6Vda
z|2bNdUf>9Hq88|PrA>9p{HHd5kB1o23b;zaK|S&w$?h3Ij|!uri$IQ{#%UIu%?%xS
zM`)TzNA_&V@>hSR(t`Y>(#~k1P))x}|Hp(6+c*OgCf-8S-f+eRAa26FZXf{34V7Vu
zYMIhkZVhV{)Cj%tkj7Ze@0qNNq}V(FI)?8imcF57nsKskP<nNC|0UyiCLFaMq=!zM
z*}pbBKzi)i)<QIR#h(X1w9$uAu&`3Gyx}OHNiGVw`Mi!S1YD7zEwsY{VtGQ6T3LH?
zzIdwi7Q_l=v+Aq}i+0Ip1SK2^6aut+lcW~LF60kCQG~3llHmk}F0TLd*GJ{QFeoWq
zS7B{t!$*gwHu7)g`WOCbBF+(i5Lmvg2VzVta$&PyWb-}ttH;c1Faj+dbpPI(07@gH
zHx^$Pc{fSqXd(v1y&E^uW&j`%%8KHb^gD2Et!1#Y2f!<#V?-!@pd>XKZB~+*ZItnP
zrVkXBol6j-CJay1lf@Tt5g1}m5k9P~A~yHGwoa9TlRE%+D_m%4-KDd>dB4|+1$cuu
zlpbv&qugPh2?yRs{D}CIlGVwEXUG^2(4ZR#c(gkfWYwY%ki-K^Qm@xQg|8DJ?#;A`
z1!$(BL8@M_;S<nUjBji8)f=vVLxv|GTAB^jsamLDQIh;k-|qk(SAIy9fSy4-vXcaX
zs{*M4V`*{E+nF?J=jVEwMObcH!pH6@y-Xyf#YYaPG1=9gYUtGlfp<33GnetHh<nBg
z-wER#T!^ZPw=4q+lNszt5F4f?E&k*eB6%)h)2G60_PCK(=$PeKzMzBOeKgK#Vv?E!
zD>L%OhP$nCEI9%^Ba&&2cUIhcvhu(-3!#`)7|?j4q#}={Ggf5|RDWY8n9L@K=rMt6
zb0%rY`>un!n=0GOV_*1dAI?#jc-KR1Lrf1C#h&wX08b?Arvk#0Y5%4B?EwBe=&W21
zEdSGyY#xm)kPkhh#oDHD-*8VI>D$bSD7b&|g45<tT+FvN9K@EW+;Wh*HG?u*WGQi*
zg;ipL!1n0qtM&zy&`+h*4jO&r+|6tF5|lrGMEno1!$_rOhZ+12cIBL{zKY)iN4JK^
zk;Us4l!)c4HxMXVlj4_Dt@kFVAJ#EQT$^N1wMh?W-saGeVtmeM*vimBWt>xQFVyWY
z@?44=7C_V~u!lL=S<qPw_`+bjM2YLNi3cloNg8tG?{sSrKUv+Nw-QIezdv9RO(w$6
zj$s&GzO;V?=5V$az(ht{7)*@xg<n0*;0I`InR+fmelW#w!*X6Jtn4|7Q}$&)eLo{E
z5x=29R6Eq!rW4;7SDEN3h3nNl+gK@E%{BU)eMUqm_d8(fy-`CAXukXie)@yxajJ-C
z%w{ox{{aR_0?=$z8av)b-H;QMuoJ-pKCWlk%&2X405uUd13vs<Rnm=4;K4LcDy5&u
zH&iukH!0Xim6cNQsI#atUNP2v08+5@z{n%0qK8ht(u_Q;0c>#imbm$9a;ym}Qu^Jm
zS*VH({tSQ?M6AG#J1cTd+5WlzEY0ID+opo{V$@IWylVfM#j;xd_r{+ri?+Ad6y36C
zC@s$&adFDx7rR}I=Caw#rRqcnnt0^sT?yGe6{PFt0lr53AkS0(No~Zes)3R1o+{wp
zOA$tzj+_luFLD)2iN`)+Oc#3v_zoys8`j8+kP<@ahINc;fQ!bYlo@;<SgIE4GE8|B
ztY}NJds&~si50%tg3ne>7R0PF_$VK8H7-V3TVoVbZCE|2VMv8kAe(tGq;+8l&h~1n
z9D%C=voN9IPoopXvGkh`=t8&HBd(+7CL0@0+8TmSYA7ynAM71`I)-}2jV6SO&^@vX
zFND$MbLAlDev^;zwvOkOprvk}64&{CEk>S<B;4Zwdp7S3HKv3(FJhJ)iN<=}OG1y@
zh$ri|&Cl=hokv`+I%`B;cvD;MfF<$m>>aWr%XXyn*x8{@7O%BdQ#Z;WHaD1azfRx6
zGT_2m<t_{o(-TQgza7R)vP*Sac6Kg=hdV2gCx=&@nKamP9utVM$RuF*0H8wtp3T%Q
zU#%meWL0w^jyhSt8=djRj$~^IO1(zllARR%)<{EkVZ_>PvlgF$$PO0fgjZ-ut%Lu~
zD|n^-&}ezu4qtP@PXC*7*PLB{Z=<(c>)YM4AF52}-fIb0pU}-7!Alpy@BvBc9C3>H
zid7{JLG3&`x;;XW;C6cc*=DgioA)S8C1K(CZ-fxD0koMOMxo)!l8~Qk<B<azpl;R6
zbB_2IT9lWFKR{^-2pnN6j_gGRe^6QeKX2yGfph!?h$AAdPf7dHphEue15Ycet1vbA
zY&yXW7N@ctL)o>;r;5abQSBQlI0=Z)*8P2eg}QOTy9|Ii`cP!`hL$j8`HjGw1Go`)
z_Y(0ABU4&hKH^C?`dA8UARkDDj>aNM&zujTkmX+JlxRt%OC!xI6K4$T40s?lDzLeh
zuMIYwrKUD9t^u6vL~1R`*jvuLa;<M!2Cu5USDzMMeMXO7fh1H?OwP}UdB0m1+KvY~
z!)4Z)@!pHTvP8*43bOUHHK`u?TH?bhZvBN-TGGBpl<}Wgp~IR0_VVY1F)V+h<!ato
z*>47bmN&f|0Z}Rdh<dMJ!t*S&BGpzqOOJHw!Q$w$RX;NXF7KxzcEJ}k2co7&Lyful
zr?!$+g_h?Vb(7B~)tlB1uwcP11g$3k&A`7br{Yt@FW6J^WwF!+Ie*PV=?vDJ0$BZu
z{FeUefJKE-l7nsV{qr92UlD(c_$@_nGxNtwCO%B-Runz_0QFk1mIM6R;*Y-5Q<8sL
zd+Tg%EK<dTlx&maT#1ZxgdQ2#`4-2bRPd0hg_DmmmD*k{C)L7e+U)B(dWTj0E8IXs
zHnR|+|7{6ro<Y?NvujONd!e9PK#Kifck>K*fNh(tM@Pz%#kVgWy8j43i0XKa_!jXH
z@tgUg*%IBt^ci={D>2d^)blEfm{-7nD%<z-KVeuG<F;nY!=)x!%=FK7AJS}CXGh2H
zE2Mj1vjc8!qmM$L3l#A3Lq8&Z*y@B)lZpKec>LhWC(A%zP!x_C{qUC?DPedXtm-pZ
z+9g@?wAqHIDzwBqLb?l}e6(8llto~Pd}=7ReF|k&{J#R<Hd0{Gz<ZWtN)P=bO1gft
zmrW{?$$APV%LPZnWQU9cmefP)&jy!rW5`!a^f_-*KqZM!1;$zyd1ua4tq92$r~>s)
zeMKWHmImra07+2j&N<xDBG%rTqJpzN5?HD%^yvFLCP!%k!-W^?^=3r_rcwC;F0Jpp
zw6z>8CyZ2VlAAyu@f{QJ0#2Wb^RHG`%MYL`L-Hpo8VTRm*5Z*z!oq0BaV3>fLHV){
zqf2q<Pco1S<2ez}26p<gX+HsQe&@yx_I5qh<MibD5)K9w-VA-bGlzF?$>C@_dUrq~
zM)$-`sgF<G-5YpvBLL6V1=aC#?Adz23%tv)_MVG4_?L(yRD$AO!;189iAAPDQ{r0j
z6bCLIM!VIp3B%_3Rak0Szc=*cnjP}aYg3Qw8lsTtM9RN7#T)v79fz_pjahR4E#6{g
zQ%6qWyM>1`3;p+J5JPY?K39N91jB8eLKWbRFWlnN*f|)KSI8c+8iGprH8s{9VE1aT
zLcP+STCcOf!eZozC%q9YC4loy-jEUYrj=|=cGIFzyCKMLvSrlb32*F}2vXWfXxYz=
zW?C`HYGQaW=17GJ(TE%0KVYIq_WH=+0FhsV1-8Ti$-hZO(^|GGM$n5S$)~dZ*aLRo
zTQFNP+9BfWCWqQmT2()^NU0mx?J44K3=B((coTa2DqWii*MFcNdA}Z{q?2$0qc2PK
z1H!Ga<JPYmaomc+vgU$tA$|76kYRHFGZrRKv}2t{Joc4g&sFy2z<VyOAZq^zqxxp%
zFck2LISxEn$|AwhfQoBpJoyjw4foAc1yrp-4RX`7Rp1+o$J+pCkA`&S%!`PA!#fn2
z{TA591DiZr%DacYcw-RGOSb+}(e14XsYsi;1`0r5Pzh`#@s$C+22oxC`E@d<lm{T0
zYH<CTOkrds;^L~6TqYU3pVrAl?;VKi0l!#f#$W58ELqSG>t#@7VP_p`C6&F{T!06>
zOiMzbvQTfqDQ`s&yv!3W{*5rQvn;rQ5vbQ+4#>SS)r=HOG_H7}I$J=Qfmj^zG&g$(
zTLUJPRpfQp4vc&Qdz=UtKRLSX`pJVI6O%2-GTVFwMbGzj^le$W#!4)UOT9$AM?}OE
zk?d{#Mr^sb(d=oLcFri0ZDjHC)$b%;v$Yb0HTAUX=&9nu8G1%YwM5t_a5Hi?^ZnaJ
ze2#eJYO^tN9z7fEeZzPc0P>A;fe^=@J+CYeO;kt~;!3g9l4Dd@tFjLHkOV;N5O4Js
zu@@!lleLD8n-%cj7P5mW-L!C139{U|13X%<x^DifO(QvzwG7mR^CYaaf#DN?cI4@1
zh@_U+j%?q5iM1aA+dWYJ_lP3m2_fA*hAfBVd&IvYD#ECyF;sNVzniDaDVUnYD?#9c
zb;V1O)kZu={F-f^{7=PqI^1RjhEtE{BUw^@_ycM|b%zVd#t!4}Iiq9dd^(%8*HdMa
z9`%`!x-mfEo&mT>p(mj9WYkJc8WZ{#**$bt0v`w(Em8kq&04t%Xo8~qYNE*_=eb}3
z8pd>Nun;rrR;NT?74aqF5qs>cV><ba_^LfWce58P37KE5X+QZ$dNo-i#*A<JgueLw
z-;9D-#8VP&2|=~Y1Dk-=xe8$Uy&L@k2kzw*@k_)Xyrb#_8JHYkIp7{gRhuW$($v`I
z`-<JTKf)`XpaOW`NUQ3IUm>3oHTX=`e!(CEO!__@k*Y?RDV2);^-<krVir*YB|96k
zRjcd<i&%mo9wI(Pe6fZZC1LfZ1FlT&eC8mv99K1zh+TVFXOKITZ>pqpZ~=R(X<sA$
z&nEmVEK^rDQSk5*ATBHVl)aa%H0{2MK_}x9dsBkvO&D{U<Emd{iueK;oNb2M<7NPu
z&9<&avUao=zcE?%Z9S5IpmeYt-e*=zwU&;0%F%kP=Jn#qb+=|G3Y`-BYy{viuQn1!
zcPy=n$mQDkEuV%l1cfzF*J&4F!N7t<2sx7-j(Dh_<}<@0jLHzJ8h}9LgE77Ze=cSI
znd3EQ-hUt^tgY;<!c}Lof!RonWa%nWu;qvLApLi<hsluT#s-X#FQfUe3ahx1#LVjr
zDAWBN$Q1URfyv~p*G(l~!Ex_QFg4jKR3_rxu>L-M)x3zk5d?67A<YT=Ab<t~6$@+J
ze~7q5d^dV?M;N|e{04$h#Fg}{WjU5@|2^WL7(!&Xa3cv)psxVIJABjLOe-lO-UzFO
z!R*<*st#SB#h@8_MZF`h5x++~*$~(snz8ve4k+wxHcK(-4Bo7*lX+MtK@*N@fr+mG
zSCugirl{%GK6FHYSt0&fiaC$8G^^_TzTSLaLK$XbnaN1#99W2H{pBl<AmvJ7T~LcN
zKEo((5a3cr$oTghepNr3kGjBSj*2=UCKD}=6!A}FKzSpZ0_Ywi8!`SX{i~10T=i=-
zTtS}BaL#8ApJ+36$IbD<XErgtAc3kECEIjf@Be7Qp~lZiyi`#kIHT)J0H7$ifTIZ(
zqP6sWvO2@8ZzjLdSP^lC>+0Cai}j}F2!1lLbwwfk%6>X(|1bd5tde@*)F7cmYPQOB
z{%LIT>j0w=rb5{>+GgX+WlAp2RB8r->ugnlJcXPw*NVdLWCfH8WY<xU&Q%-*UP2o+
zkTYuz!OTsy_?G1R3k~{(l&iN>wWIv)GSuYDQu%3}OOuuN?D0aQPU`?#ASC(4?zZKg
z#U3v|bX~uguA+_88wW0gVHrXV4oy@vvo!}7Y`)A<eCQs(vEM{+GrRgs`dg8(pV>cU
znddbH@yf|;Z8_!4)Dpdsh|gB0-ACLazDHbPgyaclqY(3tDBPdynR0TWx849@LSfEU
z*PTporzG&eUM?-wuWJ^k;8p{7U70hSIg2&#o7*U2_q=%NGk1PA$;IFJxu3Y|+=Lfm
z>nmsD1xD*;jbv|l@@l<`LVpE;_8pIO<yDIhoZ@Z&Gf?mCFls3XZiLbPqx7?|XPyoI
z&lZ5MEaAxm;1X+}K^ZlK&rHgr-lZolc6T243_v=1wR4Ot;5SONjXh+7;3nJs(Y(xr
z-&GfuO6WO<Hll!+^y<n}?yzu;0@#qjNv9$WVHjgynHqms-@j7en?csa=zs90rxVB`
z)&?y2Qt-9ch*zp2m2oukUK;_hv<|7O0SD=VTEo$~l2**1b9sZ2<{cW5K3o3+1A4S3
zmzK+FU6)_^VpGYIPfh<&&9-&i|Be`}qn;q(0&?}U^$9(gAq@O(YfN&P-T#xB?E%88
z<gCj3|37?WIVwwz_Hxz|H@GoMLDhAEnY|5FdS$+*kl{uVx5vC+aUg>+N|GiO>%}jC
z4(~Q9x}y^H8PnPUqU-EeX2WnNL!-*Z25W~BwUm5XD2y5|XM+Ek3^(mJy{BmG3mbVR
z37Rpj!A7|z8KlaUYKFbBq7{45;Ys%&b@EQuI}wEL3M3}#zWYdoF|pv29T<Vho+%Hz
zO$xoV?5H9bRh$KR=OY-f@Bkj{O3h6cn7X9vVG}AZj-Kv)0OQK(Y8K7X;PIK?R=I0q
z4TQZ)%X}*!T5GN@vEDG?&{#{VdIx6k!LWvpet=3`++<wyY}(Ea__lF|mgsRM@DIi|
zd}0ucdd2fm01}87JpY9q(+=<LjTia-FP-LZ(14@Ohg9>;36o4{=;h0QMEoA{4>%j$
zVS!gUL(p8-*Dr~F>eCrH!)oCP{?qp67p(l5!Jh@!S#Cto;g(S-1^u<p5x+#dNBkbK
zkN6G2gVXSyDBi!nEXIok$|tt4q=+g!vYo$s4PoShBJ;1fUMYQ&B~T;P@;Tw_!`Q_M
z8@sJ*Py?IXQv+F+Ls{<#m(qU3Kki}ncbwCXTM%yQ*P%-$Bnq+t`>Psyfkkah@xNh0
zIuc#PA7+J<^%vD2-M<R|s<Pbh>Ky^BjCf`*k_>E-|Nk>0BEI6GT0Fu7{B>!-Cb7SQ
z1h=<WUoLk3P91((x$m0=C|S#t4uqX?$rIWOuLLB{t0NDV{6T(k!V3lewaYN@jCt<4
zml7{BSa+{_ViL<u9MB(hQMl%!U^Z6))s&~ur6CYe7!46vC5k7rhcdf+S${CD?1Ch5
zG-devN6o0xU{wZim9=Po##_Xn*(~3gu+WqFJyR@tix_y*lUg!g>{Q=vsN#oaa`~MB
z+L5Cp2}{o5pUE=|guAYP3NU4C*764;W|wNRTa2bOfzd`0_8Rf8i2siGJNec-5xYjB
z{yE|aVPvxz1GJVmgPaE=B?1t3TvG@VdI~QVHV%9ycT^i^e)?V_{(q#$zcNl`+2@B8
zwQdK1(Ne?Gq^m7)chAn9vu5CetT6z~5B3Tw-@Fnxm~_&M0(v88e&OXfgN@kj*w=nD
zgKQ%{M|`CPuqOkNy!q9nf|s=e{UhSPm^(Mw-<Yg)RGB2P$gW>vHZ42Skr2%IJ@M)N
z`u=wmw<etCh|{cjqDOk<Hhi!=M66i28YJ@-qsY^*84Ek}?>h>Rf52kyIVnFg{}Gl!
zjp$t*A~cxL)%<003u?O?Fup7IEav!ZGj;R(@2$bW6L&HGJFM^_6VW6pAe9CO`+<mf
zvN5;QCIwStTxS!`E+^*QCKJeR^B6MnxO2q!h(DlIC+0YI6p_ZY_4^rA7*NM#jeBp6
z-rlIfD^I)H3j`O*_A}PMBN)DKLdgzuE1@o>oxg$7e;8s-?!TsSpkO<0M3py~5ARD#
zVqg_<f}81x2LrygAVH6Gp5Oh+Z!m0+PykZ8t0sRynD?QQwLm%2BUSinf|tgIeB~6v
z$UDQAFZ|#5%~t@i#=$stEA)RF{zhvZHj8wyG#Xs$s*MaxI)=ap-RKQ<G!~mpBr={i
z(k$tTuLQ6w-^?3{JgIIeW7U}On+>#(SL`;Az>Wg<#iI925AJ|6{k$%sk07J(;K<7A
zs8YV%i0%6<1C=zQMg(zS*=Km`@}pTinIWDzLT-G^ayFI<<?Htx@%M<|SR_pXDA<|<
z9RACC8cWY<d$xftv*GBTQ+VD4V50pRzF(S?EKTB40F0{j!)HLXXYr+K7M^O|bBAgF
zZv^0(MV9JD4KWG-c>kFa*~tOFqavd2ekRAx9jHokfddX8%f<S#CYY=?%K+4z+tbhb
z9`PqtfgLpN;+r1U2vpU%i*2q_yN(!CGs>cH8!Ai({MlX@rimJRl9Q32sMZeb25^`N
zJ%x=tm@P-%wtm-1)wr>N%%C(Je9h`mqEOVH(pt8@V}VPK*SMxJ7uWxl+;8TriF~T9
z_S|n~@lpxbJpbO}(ar>DZ`nxl8TtMHig=Fr67fW1?@#vm%Ei5iJ!kj-iM_d*R<s84
zmC(cQA30Fblbta_H+!6sv^a?Yh2f4Ph5L+Dselfhbwd>X{Re=5g_k%pAbGSIQRC()
z2zige2FvRRv?N3)Gq9wQsH6{JXK9MCd&6>`=_hZq4Dtp&y2o0Mmal1P$5auxaRfwr
zdKu-B-KxoBAJ;E5st8>|DJ>7btX5@VT=3>E%e^wf?5mCKSr+^suo)`+f1x`tw;vTy
zcNT*jfP__G3%~!C%7A=}yif<Ecw%*PC2E0J-5YKkK);VXylS(pXCR3T^hf~cLCTHA
z;ox7C?<5EKof~cO)x9zHQpU>ka{{qseIE?qe#RdZ)?&S1htJ>^-i)NG35%W4^G7=*
zcL%y7&@2w{3?KJkC8i*x9kcf;&*m%DKJg>sHR2yeYP7s2>3_TRx&!S2Sdw4AQTXy+
zp8c9dDy?Vbgq-MR)xeS4REKijNjrnO2IAqEwlSAB*CdZ&Gyc6J58P8U@~(27#5@BT
zm}a`5TP3NFCIXNYzsNY;<#UMm8Znq);}G#%nhHuDx8Z;$8>P7X_5uKYMEn8n2z>j8
z2vDV$4kO#yOoQ2KP|4y=9FH3NIrD;h<zCC?y=^*BX8{Th@5E!5tYU2=dx{JPKJgBR
zlijfHMb+rq;9>apTY}SrwQ${R4BbBU>KhcID0->4f90HYtWLM?q+LD(JXe#>>nsUl
zQmI_mvzUYuM~bzL$?j(RuMG{y8yT=~=KyXy;{7{IYu~awqDFe$8VsDs13Mn7AjqB8
z6l$HeFGw({?!bZ_4d1`oe<S$sDqplDpVEm{rZ~QTQ=L~5?d>`JXEKBRI`i_a7pUH&
zocR>-HR7wKV*w!9pdfZ6&tY%<I8&1;4Wj3t!rI#!+wO}ElPkz2-_sg>;s6%$&yBd(
z)4?F>xiOTLTh9o$B%*gFvX9)FvX-1fQ~<b|Xs?8BUTDlu0P=c$fRsX>%>j2-t|_=p
z8O6pbN^5vx<tl3nkZB!Zdy~`K854ft3eQw?g`*=&9|Ht>HOYES!Y>WqE2pBdTs&X@
z0eY;oa1*?ku|z07u2cBY_@4?Su(R74KzWk|u-U(VwGlQdKUer>x1!_v{tHWuRBIVv
ze+4WiHoIIe;RJjX!`hewQYUN0{pRQYfkUQ-uZeY?VSbJub>WGyP`)u*0q;T=*rO=`
z5&$uqVe|%o)B-1uR<1X%;9wpRn(i5C&_M@zNo>3Nr8K7{iF{)hPlh22Yevq?AY=L-
ztj2PO*}6=swej`<UAd~3>x=-%))3xG1r(NK$v+ju4z+U}`1i=*=Z+d-ZGG&^6~91C
zl?JX711#*I$Z0gj!h|&sm7CtbFaleh3c?$@sm=~Y{{70(!HaQD54gtzx#(>@i=!Ze
z>G0d1xX<4B%$X^VmrWhNN@kOtxR%UnfXldM@S&hg_rx8(^S>oclB&bKQ;IILi=VK;
zg5!FzcAh1?-^`8Y4k*C>N6U~4g4qvCTk8zL%obT1B%u-UXT+cU$pwA>WvwFeRXrk}
z(OFmScjI7gRBsAP)hlaB?ZG<g->b9qH*CXtkm_g-EZED2#oOCC{p<U;h(9C#XT&d*
zK@O%z6^NTmrnO=J2dh+e9F@tcXBYlXj>Kkj*DsuTxlVP5MjF5p``3T#K@#tH@)ns+
zV|`t@_p6LpDh)CVNy*b}6!8b+pTmIai%lm|4A@x0#tLv9wdb9clGQlTnnmSXFmrwe
z{LhWrNMn*<`Lzup(0eKqkt1D}^T9MN$r4UBwyCkkfr*r_w54tQ;q!Ki5z$bK8MNu!
zNZXA6&;g#Ub|b53kYEgWh`~C`@1GJG+sOtJlooDO#F#c$-_58tj5iZa2b<2)twGd?
z(H=N4rMdJTOnNk3&V~e`wQeglflH1vBL@4velNDF#<JdkiB?VO4SU|Uz6u=Hkzrne
zLf%odGkm-8{UbS&nmwd$sPG~E<{4BHG+0OBL}v4cq509k^<UpyBIZ9L{u2wE5#7Dw
zy_awPi1?GcOV+O>yO^jWsulhTA5=%Ykdx-m-`JZl#2>zpL0o91xFG~fD8v$caKHx+
zHu9vls>~Q$AJwXE7CL6mi7qvKG>geS!LqRLMk-a1GH;l`zXOrM%Bd>YAn<uJrYSd}
znHpn(sgCyUoU`@w9zV*!uJ*RFo-b(qdb0UoArlNV<8&MJ<h2cV=<&FvNn`F`E-|kG
z&Z{=n!o;}Iu#rLeXSSyp3_a~l6)&N_q;Zkmg+gBt>n6_C4>oUDQ&^Q*Zeas0)QzBo
zOs;KJ1E|=jMT-8b@n#341s&g+vrf#8+U>|iGquglx|8}Wg3a&`r4^opH*OXR$Be!A
zPY>Z!*fMrDkYG<8uUX3=N07>cMc@%%(OXoj0oS)R2;boxI&{PpAyL5Tx_SSe#nvWz
ztodnKd})T6I6wu*hV|d!_Zl*z<(aSR0N$raWnld(Q<c@G4A`K-Cu3ODg{F?nb3{Q}
z^aE9Ja{mV!iGo}Gjkq`3`*AMoq2Q|F1PJVifVjl^ON+<1^%l4eKsky^2fZEry@ka4
zN5qntBPX0{JnoD|U9teObUplE7C;KD;1x6)M8fW)Fw*lv-avKnljOXnH-AXY>HZmw
z=Q@B_o3x~Hj`+#zQS(m)Nh9G$3zsrjOKpWk=bsL$?!oBbmbI){&e?=O$wzT}GNsB9
z!Cn~X=+@)ZN2J$171%cItK5T1hL0-P9dFgVLB6z1AuDXcpJH3?JX$cIxAB+wm9Tci
zVzZGgz|G>4q&8Ys`-URy>dv>$zJ1{81E;z)XVl`G>ue7vLp?`hA;kGCuxw|0m^h#-
zVCz86(!=43Ky2&)ZFrw0$^6M;O6_qY?Hz%YV>)%`uPfZ3Q3q>EQAt35XEtP(pPqq=
z$+w6m;%me&Hc5Sl81p~SYDabpqrsn})f;b2U^p3#9NG1y+a1X~pNIr4hoOZSO@urJ
z<LaIb2RZO;XfM5*_sZ8_=y)>5xg>Eq8F_bEw{j?;4CG;tv?oud1?#|+xdT>vQe!nT
zEs7MLB`#^B6IcH-SweR-b;XQ<%F+`6^Tv#X0|z~<sxb5KYHdmoXv-WAu@%}OHAT+W
zN3)^A3iIJI<e60domm?__rFoc?~O55nN#b$a4G6lNF3;3;}1%68xyFoL`V}K0k|E_
z=#_z<v(b5xPNvP>@MJkcX$k3s?o*xo8yU28_;Rjw+6_{hN@e5bC*EB567d;RQ3f`E
z+DmE^8!=ozJf<o%31<QaGp_ev+CWw6XIP7xBOpdNQNbkvpfaSWlH-IwY2l#MQFOoc
z8$MN~_KS(L7h>_xh-bz#B`Hi{b5hSF+$FE}8i8yrAZEJ)DGshA2FN8Xo}}cjsjuRG
z1}4*(_tpnqTYYihctsWf#Q}@!^@CoUC#q0e9@Yjd70pj0sZ`7ufb!+qfR%wQV!Zc=
zFpEE!$0#5I-rR>5c<H0{85Ptq7xrIOq1)yxVCQgWd!1MqzLhiDmcq*cm7K@0={{lY
zNWWxnG@F2L(jHW{_zehB6};vn3gn$kMJg<LIk7{H4eGpq`NAC32Q06!{<-C`FZ}(y
z5l_$GnSM<!;ZsBd>Q-~WwiLZab2QoE3nOnIU?J<Z{fP((9Q6*baxiDKGkQJ=fG-hm
z$W2Es{*!YyBI`J2<$qngWg3VK?i%qY+EP{GcAU#xgdxIF^^L)bStIK*n|r^18c5pZ
zCU-;+74&O<Xq$WvGnm>QQmL?9fBR7lNo5rkJyix+w(}`%g31dx%TFwh?K3&C#N;>n
zYIEm#H^T`jzj|qA0}EmW>rD<!Pr1iM>9~pWnXJ1i=|uD*<OKxYV2QJlGA_c;4?u;x
zYB=sEp1T4~54ftlf>}{reYZh(Z*a{c$EanKi?#J>Ri3b$lO6QhfcJI{qa;|Lt%A3+
zHr^5^Rc78ZY-57I@nD_A7oKH=0*lN5CD(O#yEiUO3}<gBDZl@M!_*k>+5jxByyPJ=
z7Dmx~+5D4@qUZoL+T(KVzzamDCk$?67Sk8_%b8W|vjT)Z0`YpBnJ%233YWvOdj0$d
z5Nv>WzJX)}+`S{{{+R(f%a<R_T`m9eYU%fO{_o*R9#?R!B`xZ0Arl``kp%WMz@O;X
z_yQQ)Szlb4ZGn#ZltQAxlpW8k>js6T(woTfy^Rx}n2*tLvymNvil8H&PW<m93nV{V
z8$$VOS6g%>Pel0(sIW1Pcboq#(2yTYjWJ@y!r>P)r$j^|4W~5vlX;{^R!#sW7oqHE
z!PG#J{4ZH@mK>g125@6m{N5%5O4qWtjB0l5tQ&Y{09UYdDz#Vi7*)pSZ4_Y&psd8k
z-qAIhd2PcVDd5G(90&$s#G>T>i2*NVE{ESU^Bo&pzZNCdF{H#})&%RoSHCa{vDwVu
zv_I-0obgh7Os9tAo{d7dy!P)u004jhNkl<ZJJ6PgKSewP=x1C?$2$NTn<tkXpouM+
z`1#g43!Hj9Z5EOJhXT>!I~jRP3*Q?>rd$Iz<9BZCO>a|I8^VLSWAf;?qYG{+<v$zg
z-&ox0aS0nqWP{(8C!4|1>LC|U{On0wVeO3t5Asjgl3T=o69x{}Y?(M9ZDH6{{oa!8
z7K>;<qOb;}B8^RAlrls5lXsf5^N%Ekqm8?h;6(zb2g)mzfAhek$?vB2n<Ji4bS=0~
z?AEy6{~Iju5ko4`fd4b%6@sWRN<T544&&`@wU)bk#J4J;PXbV3{{6&Je~tK#@eVvx
zOBre-16wfqFE$TvvDu&e%7OVZ;<fId+U7R}v7&(!5uR2<LC&D4zIDV~cpbVW!h|(~
z-mm=E8qSts^y$dY#wwF?>4h8co_fd5)Z`KIiZeZ!#^sp7r<nz*%}X~1B}e{z{C`By
zQs~?`42`{FqDtTRko-z*b-c;w?uyYslWBbB%m8mNS4XZnKm>HWMf!{3(`^LtU~8i&
zGhocZk(aEQPiNpvpq1aa5*r_^mu*^0sl&Ei_4*)m6$Y+ai-D1SfBEz_nNVqSLm$Gc
zr9$^_ju2A{LlZH4Pb&9@<;~gs>)~%0%xq{Z7}l&rZMT^O%DK%(1oi;W30|wB7}L7_
zOD)rxWc*iF^apboc0}?g|Ne~qN(n9d=~$hCX)Ff|HOc+gCev>zxlY`(+FG_dp=*9}
zFtuAvJW%NDEU+fmuOKW3S}k>}IxX)I3L6e|wod*VweJo<eg#hkrjfe<i3Il`ITg`I
zeET5<>IsD%0LYft&-_6{z%$lT6C1W?85IZ_m3~~!7mfC!;T}hj_>Z3v-y$NNh`#3)
zSy$7z-7*GXHmp9TIbLs#Qvtb=bTiylK#i#!ORGSRAm}mWpf?b(^zN#Lz9DPwaiAlO
zZ5z#i71q<D7*s91u#9}#l)*_x)~2M+WXVs%QopZNOlPZFPrSSJMqoQy`B%Lt$@gCf
zcQaXR$-&SNftGWgKY4*?k|kZ)>&oO9q9mL30XRo|kN5)c*jpR(f{!IrnAa@CY?V@V
z&;W@*cE8VTL1m(jYCVkI9gtL}YUb?WhIb@^w}@Y}nW8g1QDX|~7xUzcD)So$uvp*x
zl@=fYKgrQ1(-Suz_3|h_NcN-&5MioTmH(6LOCHs6dtZ3?D@2Ie5~fUuArpGDh8-17
zU3PWGmdnk<Bfp_?&nraW84kE)FOu(%c&ve)_l`oz0$#bnU3LovN`VnaU0EYrn7sSv
zrVLsNl!-j%#aPI{@r!CH&ncwIkX?zR6AL6AKn)l=WJ#cul@p@nD|&;<O2G{`Rj<*L
z%81(70wItd^};g0W-+*je>{=O7gmk$NZFJG*4g4$1`a2}-N6VqLE_nJ$`Kv_Y7>Fa
z{9Z@Ak<pE78Cw|Rce4qMK^?BPX-j8b%AETUas7{mh6X!WLwX+aFOLX8_9mVBiKaJD
zmy~AevwvAt!vp_PLFWj~zF2Ee?){DcH<5lN@ae&VaI+Zz!i+|#%-KZz9WG%beUPnX
z)wPQ0=eozrY75Ew*ES_eg~Qs0?04ppHu~`810}y-R!Bvt!RXoN^-H;yWBIf?nOexj
z=)aL%Ue^?&N`p6AkX(qfn8O<&YU3kjx-zVC1J48|*c*#?x1Sr8t;yOrYx8PhO&H_a
z{uTi7BjOnGiF%<N-3}{Ma<nZ(nT=~MLEUdK>7$+gCVTP@30-gf@JhUOE*0muV`v!L
zRGPsR>}yZ0yj%lF8J2dm01PZup;y#(`wRol#`%t{LTg&lf%HQNJW=sYG!ia6t>i$J
zLC?XuNDD`Q{wqq8BxsPVW`JOqVj~Fv1y(VcA&*F27GO3qVx`&?Q7BvnIqq_k(!WVj
zcG>O(RuS?w&D;i-PBP5bh@URTlvL=QM7+eG>;ao%7zg^j?pb(Y|Ach~%W0l%N}m6D
zz5$|V&ID0Ob9O97Z4U5${f5A+0^`C&o@~7OTf~1w%v8n_HzHZu%ng_PQyJ|r8#|VJ
zA2l;-o!@zt@v`QR_G;(zW*vBsR<bbz>M4UrWcIptpf%3v0&Nr|5l;~>5x<*$qTXEp
z9tSAP@nV-V(bBjir~j0`eRJ71$(jpB!+ekU@2nB2u=ax;@O4rDULyWv3R7zXUN!(k
z53CW3qDNTRi5E*;$8{K)2m=byDoYNw|LWC9Sg0FYDthBa2KvA$3jXWHBg}?OZf1Ww
zVoeV<b)qJkeB{}Wlqgi0SKt4h{A?qAovBy4*rGK?qGbs_Mf`(clHC8z#78Ap)}xO6
zzq@o~&R^HBtl=|IK&Woh*=2f;Q|I=hB=RldJE@o=$zYn8+qw%&zbzF{$H{v#SEEW|
z%>yP&SeQw>s(Q!&jvUSIqcTLnDyY^WJ5x{0Tg1OWNPpe!#$zY~E0?cv58bNA>#whM
z^lUxC7BzN^3Z7(MO9+%dPk0jDcWJt_864QG5k$b@Jy;9J0NWuN_hKWRk}A5~Qr2Uj
zC!dJsQ^b$VfrTN^1FL@_d{^c-FO&*4Kxj|Z^km}?mOS8}mXXd3`%QG{gP|6g(G||*
zfk?PGmVZV3Yu(5eIEwFV@5x5WyC5JeS|{tj1FyWB{Zu@B%iT4bLb3vUpesD^6ly$q
z#3ddx2P*z@`R8KCAh^EB@a+K5VcrtCR8S>7qcBfa-%J*-grWE^)H1GHR0qTrl$sP#
zXZ}XTX%Xj8fJy{oa`@`M5<sUbjflUpqA#!r6FE)NNVjJa+>GvWi8v#U(Is#>2R~cH
zcB#q^xG~`gC)nLxw*M3O>Q5V?R2uEJ9GGXD=36>|YpwcLA-z^jd?L1~IC{-e>5}b{
zg+4JnyT{?p9PkYaG%Q@IL^Wd6`U?XDVO~R8e9Ko#>1qU9_UvHkpP)y61V<8#9aQ(#
zCSs}1QPM9v>)76ZKwTg^OR5SkeX*A#9M|Jmr4l``Y47F__QvqNu8UBrK+8GEfy)O6
z+q*4;lTApAF(4d2(S>q@R5@ft#MxFsmA(mgduB@(>wC<Yih{@9afecpp6Ox=>+h~M
zvv~F3UQwh3?-y?E++^A5{kA(SP60?pD<W@CJS^6YY$~GdIjDO)k*bM<<y86NH#m)s
zR)mvPkIp#8d99-lYpj|%ZUx=vH*3qEu<%?*P*~1eTIW_nV`*=w;JQwq3M+Al@O$Tk
z%r+6JHLRX|gaG%v5pBVXU(tnhHW_o{`vn2u9X;ZXFkG`bBd2!*K$L*^miQvXqV2p|
z49Yj}gq>ap*UH*PpnS&yB>`N8x|2;xDKTe#AvU9jAKpNScffK3MPbF#7MuGg@^7$Y
zU+s9KDNHwrSTS5D3~OgEr}+`_omsOdGn0)P=;DK)@C6I{5L|Fo?kS@{3)2}%bll>z
zPHcE<kCr^Yyj902pCjkL;N&#)Retelrgs&P<S7Nm_<|2Ah$%|ylRB`zEVihQI5YjJ
zqAIYbUHq#}uD?HkD9YA;RHV?sp!fy>8de6VtoGM%dPXa3)RssmHr!e_!aF}CUB}L3
z8i@@E0qMA0@u#X`e2?f5f@~0oI)fDxZKXskNp6m?(m^_&JiK}=29Sn|a95UKxrN`L
zl#?%C(eJd@eh_AQg&n5Ye1w^x`()F>8#;Q%h_BSN5)M>Jeh+X}#FU1!rz-JCPMX7?
ziNF13mh%qe>g`JJ>-<?Pn3v41EVbfhO%k=i#4mEgzY~dOLYGSR1^S)0^(Y7ds2e@-
z(puKAD`A*0r{KhbRF>^e#w*l}bn_#S?EgR>HSqA>aXk~^3*KpzdoMqp!XvXO5$BJf
zTR<HAN5ofCqUy<8d&B59|2d%=II~YBiEjaOGU4h!MSRKX;I#>;-#9#1;`YFHcCd&W
z&SyCljj=FCz?jm?h5UZagggTKuNb+=uLdykh8(e23%P`zYc{O0{!@`k%Mmyd$x1!>
zeGLaYI<ggSpnS&2wuLN#M6tE*VfFRQ=?x?{JW;{*3e?Ei9?iUlb@=mTxLGNzA+;>*
z9t*2!ZfxMPW8c#D@e^5OZRa7EaJ*Zq)ocMhOw&FC&WH6Q2*h?A$#()xWyI>dR$=JG
zb=JajGM8|~Fh(BgDa+ubF&;Q2t+9O#|5Z(z)4$n*(je~y3sl*t|7Kl;Rt0}1Sqv8M
ziiWd+p}uThR(bk6Gc{(@ur3btkhK79eg=-qi7jl+!wsgbOdQCvu%Fmubq$Opgd@Kq
zgmB;a1`4A#?=1<9&QfLUNd5>V&}HC0MPUs__#HNB{Mmf98fZmC1zte9rd$WHJl;~H
z{l)5jVSN;;(ry5Vf}xW~7GD(HSBuRZnE2*iK<G)QrDRj&l_iH+lnoD=3lq0BW47w*
zeIf~4^n<~4s(JGf9zZ<r81GD4qV(gsey`YMoi>fNdzUckuh5>`Doh3HmT=i||G~L_
zw<fOpl@l4L#y}}yJ9D9>eEJCs+14S?yaeHyZM?C79r^2EO#I#YRHIUEFFthQFy+ll
z>(9&*U7)_T`m>Ev*r;I?W+H0)m}yFTM7C1lz>_sBEGE7qbZacqo<U}n%~bsG{*M;>
zcQA%MsolvWVMz_3hq|1N_Np;&xq%WWnjj$gH}xsa2ER%KoJR|C8#lk8vj!rdwT$%H
z1hC4*i|osz^_$4h#sBPp_#!2f{gcA!NWx$7KGA|}NK!YhZ+<YL=nb}G&sqp~Tx~RQ
z^8JZ09B^qn)IAeJ90z`Aqkz)bq=ngDVl=X&fvU0pC;$%;-=HdI$Vasn7W~R(4S<y0
zseIp>uOD&!6~*cEdI@_QyQL-qi2yWEW&Wraa>6s}w3ZCC+*u?Y03gZrOOmaYg{TaD
zO;#1y@fSuvKLQ{P?!F)ckml_r;#<Veh#%|o>+-0cF0zKub%0LlY~Wswp(~ppA-zmt
z?^)TgG~)Qe@4qAK5$(F<(O10FfmHmTJ_qNH!|F6}2fRRu(>_w!>NgM2Xl=ZYJoFVx
zd&DvS+)TkI%<ar}J*B1cvuQgwKH7j5usMNg)bmvtles|wNab4b(zoyE4*!mFLO!fo
z2ap=6&_5%7x7isxLp==*)4R0~kc>>NPR~$K$FvEf&kk^5O&B|)&}J;5PdV84HG)%u
zP?~b(3Z=O>^<d|rXKe)&XT4ya+tmb0vk5$g42MW$@~2f<8lqZoQ{5&9tPH5Owynar
z&K+y^mPy$3>A`CiA?Ctz*8GXv&y+p6=n2&mN<;G(eDG{XJz-HAFy6Z<+D7E1m09Qf
z(I^G+WXoS&kjUqVUl9`r)$F_sr#kk)_)K+h^jPj>sqC=!;c|eg+sWfed{wbo8%e;(
zjY*|iT23$ETC-9~2D#Y71I^Son_r|`Rd{~qT;~*CVU2LEGb^dV_4b;*4dWk3P|i>(
z6@EL2FhP*X_rJnCZVbhKGUn<lvE{OUABzeI`*fP@-a2|^#T}{mecK}YrP2zDkKi~-
zT(9d{$N{=6%tb$1^NsTFgUOrn0C0}@hJHeeA@b*H`r_?R4g6R#5H+u8oK3-lj9CmQ
zu-JkAlN})ZXkC5k&Ft4HSY@Nr{svwtXuB%lRx32!8(nZ(kB)k2Yhu2Mt*<j3uO7Z(
z(A$p1I}kd&FQy}fPJCzyHzFuL*@8C~Op4Wvk;H~8+VP%@43!u*@G7|swScYr7g0h*
zp90rEP@B0KQ~B4*jTEh<f+#Tk18$`Fr~^YkA;L>W892bn>LkNDfK{ntRpL3ZKv&DZ
zBb!Dso6~3ki_WTF2#GVP_{ROK<M_cSj5BQoqiu7!Rsl%`cH<G=d0OROVRHh@><E;U
z<|S5d46;{bQ(~7lPH}0oymm%`%|>9Wu$N?z3J&5fDoZvhpm125Sgyklc78-dtA^7H
zuVSfrm)&u_1+%$_<#S4#Hw&|i901Ax$d**DD6D)vnu@T_ir5V+*O(U{xv~T`v+2Fk
zB5)XpsSv-Ysin6%>}}KfCfwpPu4_jY`5N(OL>KW3)bk~KC7ROCrH|mrR5(+L<0jn8
zheXh5)YtcjYsBB+@NW<t{aS|`rUcw^5{_WcJ==J*7mDNvyu!qOi}-{y66$HKpYOvW
zsw7nJR=;01bIB~mh%Y$c96mlmwOu#{4VNk{DLx~uNhmnUNR=rRsHNnRBm>Q_?`KHX
zf5Ir_jB7H8S1yn&{t_8JQN1Y5%P4fcTCFlU04lbY1kkJLYHn7{Ix@&-gF~;aAwoID
zY+~kjVtBp;+G-WIuFPY9FhxuUh&yNYe?^@84}jZX8SDXt^USom4dcCW!9Al8mIm4<
zLn3=v#QpmIMZG^qbP<2>8pa2d5d*x=Mh#~+{hj(9)I3$;*d&(Qs9`LN_XT=n!h1LN
zwvgFM!AT#83RQvYtN^=JD(2M9=GXV4qqx9{jSTV|DV;X%e@_OYKqXySVH0Ss#Mb2S
zt2o)(5`-b)5mfhuu_{Rbp7B`{0i1!}7Z{YDN?x-a04~<0+v25r>`7;1&gy`LMWj6f
zHr?0~mfj&4iZzW{kYPRBB&8B(Ciqa<8y7YzFsh|rtX{UaiN=M|z&VFD*xa!iDfA~E
zxMWvH{M1W!7ZOs;M&|a$>lFBgHwe_6g3?Cs?TD*bsFv!+VA1`{72)c@1O-Xjl(jqn
z(fk5`{hKsjJ07w&o~ptJ_txN+um7^!M=N(+;I9fiehmxuxb6f#AjH^XB(s%lm))B9
zl?@r5=+@q3TZIi`Y3yS8_vL6sq`boj?d&j1UAA9~7zTU(8#(R8vYE;fnrnstmxp|1
z@Z)SlK=x2*wZ-#|^>3+F?YaIZs_C;$BZ;;55`u2<VefQZd`0--GPh6UcS!&VgYQxz
zpIMzfhLp5mW_M9o+vU|-Z$*k|N<~sc56Z7ccBM2%sRhPedHxS;KsQ*F%0fhoY1HPY
zr}g=r!3uk_1_|q1lR54=X4U%P1K)JC47M;i-JbBCXbER>{~v_W7CSF35;w+9Cd~g|
zpTR~cD02-n-Nxv>F>^pQ#qTo;c5e)Qj);f`*ZCcPkRZ6{h`&dC;pp59{#`f&v2KFa
z8qB6KmR9ZwK(7!vY!)#W1@CLbFDzPX^LKkg4W;#JCl?Q_UlIh#DMl6|Y{X@L{}Y{v
zY6==6ep>!`0Q+9n?=z4OcmOy(ar(oCX(`PvH@_*%7!A11SlERmY9sy9&Cm8GyAmyq
z+G__O=F^&v5!G{r&Kmhg*<hF^A#a73bKThyR~bRe++;^jpXE=_>}>_-vY2BFn*aav
z^<T@9BuSGdC<YxJ6cL$O;O`#VoqhkOS)#q3nZK*6DnZ17jtuRE<;TvZrl*{eilQPM
zn5l}2u&}U*9qFWAWPptgNEl2~EJG?N<A|wP*nb=Gh3u}jR+g8g<!N0r0YF041`4#n
z;074_0ETVXEJm2Xf1+x5$EE~HbeLdwQA8c>Yz$NxE-=o&SSfzWo?o>-Z=nOv=7n0a
zwyPzFi}TxArcm&)2RQgQo)xv@Jyqg=_SOo~Q3tQ!qBiz3$2AF|0>G7)_KrJ}crVBU
zUlSf>GM@O}LX4;$mvDJ2)$P@DW=GQ~H1=$W5-`0>9!OmKHR#K_rDRyCRd31O;uuNv
zcqR7Dlw{3-)RxB7w}`iO6@3CeHHN+RHjA}3)44c-?>?wQuyQsEphy4HL8FNeU8GR6
zgyJb8sKZ^+HAx*13~PBxQm`wyUSL@uL>ji<070ezaJ8q`sfebIs1A;1CWwB6Cc0VQ
zjQS3^X!S!6HY0bw^ErUr`jHFHS7j_v5Kq<qel;APKfurA+-EE1^sLB*c$(4B^#;86
zM(30y^3yW(b_oLP*>lNh_LPJa(gqH|CL04L3nLqwYIh~pPF&H6$9^tqD6OePr6MbB
z#B9%6X2MX-Ea9!C@K;#O4UdJ<eq~+?5rvfj&OA!yaXB0z?L@6@4o88CNRDNqMPatF
z@2HK_S8Y!KK66@X!=xR&M89^RgH@OHpLi`b4(-_81$S|cc!~H81<JfstwsMHNpQgx
zEdWSm<7WyI;ua(#@}yp;-+uEOJdA=thc7vBAumLQ#cN<=c1%mtk4TnNSWm!v;sQ$^
zMsY`!Dv0~@9UV>3Q74!E=a(0(!4Wb^m4BfI`L@~}s3@wEiD_s2PHlkZ%wlFVfXcjk
z_=~U<zP78_8z#>i(18Qob9z<5nvMK@4IdjG;!s%mXWq>Jju;|-VlnU8z*I(dB84xY
zP)0nyY;D9nr3h}ar(%xAB|RDMd9ohbl*3dLM{QMm*-&Yt?o<TMEa4U4sK$-?+`Qt3
z9zT-+Nrfeyhx$oNa=#CHdruL+dN;Z&Jmf~CWeQfb;%#Xg?Sx?tcElg8;cqk#cm5B(
zY!${M=4mAB^=oYHRNfBonS~8J#>qf<VV!!<>v2#DZq8D-RzTR<k^WCLn8whp(oeN_
z3#8Nu08$-tvXl1#7h2k1Ua}f$@o!tJ(iJ@DxiuvA`dK4hOQpN9rlyASYs2RcmZ%hd
z@dcdI4u2|CG^fG`I%HpyB77K7Qvca2@2>dXf+uZd(x<{&@GjUwfm1usAJEg;@RU{3
zPv&+yR5NOBs7>#a+Mn#F1u;n&Dqc*5g@y6oGJw$_<FI@vk-$+HDYC$Edeg+d9WYuI
z^lJJ8-oeMW&-X;F(&o>EmGwTbr@e*v#cMLuQt=h<5kH8G^Yvd4GNoDET)zDVz?e_<
zE0xKbC%#INb4y|hFv<j7S6d5IZ8dF)kW&OuST9Rq<?ogqnZNX%K_+QClhWR+dGf*x
zYvKbZE+nO3lwc`M+VYKsncl>kV|@z~(X+KNY>&8s3UKru@%x&>s%fM;7zNVvb*0r|
zU)B}02FS0t-<Xs-xcI@2S0;&(ClP-XEa%l?cq-!*Lw4k@)bLdC0LC5ls1J5=@%sbW
zK?iER*btB{Dzec$LHa7;sT#{;CQGGHMsP&}z&bLhvptq*1avi=&ZH-2u?4kdy(cqD
zG0_sKW&&8jvC}VaXja)`S~CFJ5lj~<rVY=NyyN7b4V8Vv`8;s`((^xXBsJ^cU?wyx
zaUbjr#rp85<h{3ESczCFY-B>NQ6)FOGxB;Coy*mYTy{rtzey-84`>ZxSk;qzE^1hJ
zfZQ^GU`_9B7KV_$J~|kMumI6gSO|nD|DVi4J1ZsMR!_8i0ZcuUhg4i=2P>GVBNgON
z6JpZ6sqzBeFE4_h%;T20k@;`-5&uWT14g6rZS4_QMeIAPjt_RKH~dUkBV3Ar)(VNq
zN*8)t&;Lz9SlB$dfJ`1S><6IKHH!d6()4zE8f5hw&TVaavV05>QGLNnU#ahgWk82y
z{lwfng8k=fQ~!{P(<c)YEIuz>)|qHn%8#ohMEN?TnC*a3+TgSw&>j?Iu&Ihh_ETVC
zdwhuK*@iC4P1bvym^R)<JfWYvBY=v0%Tyq>^17rRH>TbTNdim0D3kuZ;B?iFrZ(P@
zIPnK#P0~vc4gWpj1b}S#>Jf_?*YGtAVg#F9Li;GNNPMGC;D!Gf8A7r(He)hr&yGB(
zF@N*tVw|DX3j*+0GsG>el{0}qtgl~RG#Tx5v7EBUF4bAGIDbmN<};(y7F^L}EN~=!
zNTNTkz$ICMwnp7yb&|fM89>b%mN?4N;>(c(cuZIn0o*U9e~=hU9M+lPfRF13f+$K<
z%S33FK7BDgU_ON}VJ_ZjWSQ{cPgau+yjqhS;10i6Q7^kNJhy~~mM$tyA>|oVC@H}X
z7n`srOqCjtem|@ZS6HCzxvqgUH>1L+ST@=11TVJ3FdJrxr2984^F8MfRbTZQN(RB*
zbM8MlaR)10wg8_k4}@9R@Rhoa<j)gEJK)|Mb~IJu%~A4qlalv#7(4zugJAPIZfSEa
zJP+^}Jh_;s%~G_zFuuzA#%sv*g3vMWwtY4RsLASVijWgM)WJ9QHfCq>d2QsC<G2M?
z1hK8eJ&Z{Va9VpZbE;W?F=qpnyc9zX&_n9tx?;a#CrcnvMuj6CM{kW>Vt(bIT$6G8
z@R4$f80HGkaWHvhNuAI|VSg^ZbnjcsQPf``@zyn|Aj|%1Yt;&VqyYV03|kh7@#_3d
z_^B)T(tX4i{!+!XKVYG66sl^FN=>|y@_|wGK(E`6*S-cv>m-~7J7L03Dd7rJ>n#kZ
z;ONyx>0j{*fdhF@Xzap3)nHDq<=*xH{~oWDDdLTo_@2!j@dpo<RvZ|UnnBPN6yueJ
zkvivUPfU5wa;T-5dt;J1m7mq4vS0HrLU3Uxt1!q{T1Q)J*Yf8}`Z_W0gK<$c!o1G*
zC5@q_rDjTSTnHutH{T3g+0n15w;rr#tHLXcM)IdiP}<0icM||g-q&D`3*(ywQ~yuI
zF5-WpWGt<ATH<%gtnmm$jr{YQ(IJ&dXlg4|rb%4ETcuqOJdp}Z$}-Xb5KuiARND=|
zc_D@bIZ=kf(n3gcR#?!S*pzK{7SITSezf~k(Qp2#6EemfX0o7AR3<%`itoxLROXw9
z_2TLJJG+H;+0pgEMDA%lN@j*dFXpo*Q!ZZ6qYiv~to_d(gKha(Z!Ahp5-=s{tDm2m
z9SY_qv}S*bDw#%DQLP7-T4g--9}YP4*JKoj_SsYO0_JQs=XdKb2><aL1titHu<JVr
zzQBMQQj>?o0Zjnxft47jY+kG;d!)Ydl01-M!?70c+K>v=xa7iWni&<%U=3@%&0LW)
z8_5g;k=gC{#li1wsEOC)N-nUse!kQIRIv#B>0?9#KDoyqPV4Iz4LGr2YGFFr@TI{#
zRC<O+WW0q*rE2<z<|!a0Wwo8VL5F-!fS2}%iz%}sdwL7Is>zl^q6A5pv;?<Rm?^Xe
z)B36PJcwjhBk!i<{%7Oc+brIZi<deqj1<URzX0Y;8y#4SleLg_V68<b^JMi#5$LMP
zl}jDCf_a?)2-)AP0@)ySk^QKKl)wIfdFvr>Gz3NLim+$7#(_#a-^pWkOF=?Oa`G1|
zvEMeMUydB<ctYemj<o;`Y}N>Res##J)JU~q|AJNAPzGCc@(Y}vIf6ZVq1ul6STY5)
z9bG>iTcxg_6F;u9d>gq}kHA3~AWVnQ8Y<PHUWg!^TId1mJ0!J(4b*u;V7D-yb;o(Z
zSs3i%izNEo0)V98y5f5P{nhqE-F_J<?g{oz)a!=Z2way)l{AIb8Lo{&kp}CTOG}&Z
z{rA=4t1_wNdE!IE#L0^d7)@e0n4nn2bGhU`+mEb+CK_LcKG}4TE299@SIME560paO
zyFlwBJ5OIWH1m!#4gCbAKGVA=QmKOtyt=uC*60Dry2A#;4IJY(dwP8%dRza(=Ddvt
zR7O9*QvTFr8ZA}3!ZMRR75J0|EScE;mF%dtb3bCmVV-#LD#&yvnuos8(DIBkSNChT
zj^RCLy5SB>8|J1k+}UWPOc|mg>n+%)Fxpc6i<t~ktFSee{1w+Vm>#iSQy2;hXU^|v
zzM--6KcDNn^&p$l@Yx7lxHGUkS$NG*XkndON?bcq!4s+cj*z~03a21qePMo<$_N!%
z%La<$Y_f*is|NA)$PTX-E3SqiG76aBg#0k2NWrZZ-1Y@G^S++NG}1?N2Ed-MqCE-#
zZQF4@6iq?vMAynh614?VmgebhF<{XWwzu@oS>_LFR}b7zCjVF6pft$3v8PNl5r09K
zaYzv8fIQ;B8S((21yu5#z6xty6}_agZfEl6S=jakdiHh+`qtS1Plm%JO1hx*xMZ_f
zMW%rtpNQQDs}1D6XF-(Ou-CmYmU{y5)@Fk(Zhp%dDB!T&XrzpnC}wdLm36+U8eEuW
zL)yIf?D{7<UnV0GOBw__%rmUWv-of<TNiiikffhg#GT0y7x&*T$DgY3IkA3{p!A>3
zGAfO&*({YCZ$!U)leR2W7?N;JCd3kHtKQDe<heuQ0Q~<;j?_Wt9jMI>pG6M|S;X_a
zM+Ca*0gcbj8Uz<N-*SFxTRd0wcEFg++WU!r71c9x+j~p-w8rTXmIFPR@MGY>btEBM
ztF|qUK!yDu0P%BtIGdv90@L#0>Azc1!_YvbtPPg`&RHDRZOFmxHFRQ2JufeMVnXHB
zIIhKRx68(%15b&b7>s$uI}{0p>ngO0g()q7Nkk!W#wr7l4an(x3B6`juXb7&OIMme
zTI|!sJlQ)r{s0bM$ha!j>Vge=LUFYxie=!R?xZ#lQ-In<JKW*S_h_6_8h$jHbit16
z#5TO=l2w8YWXKaRuQqov(!uR|mGmV;YdP!ch?{?Eo=<UJw^L1IZF39I6UIE?`UHaB
zt=-`El627vRD%gQ!uY2=1u-p^a92Z0Z#-qy%i4<ya0fK&Xnzm^ezj>1DytX%`2fM)
zTLydP`Qs0!*%`rinK0a$Y@@N|*po*DcbB>U7F~gux5+9lH9oK;A@2Mb&hYPry2YDq
z#4+NJh!gW)#;oOO@y~d?7Q`T~zOaWmWL_fdMm}>i70~>dGf41cg`GcNJ)1OJ)d8Nu
z7h?XF6Q#!V`p>w?u<qr7UmGmEf2^lDsN-9wy5-kgP0$-zm4}gn?3GENCInK_1!YuB
ztqDy}ES~U8No?xz$BQIReMsts+*$$RY^P{XC9Ze>*^qD5eD}n1C?xb*^tA9qli>(Q
zI>H-AqtVkdFmY!pcJ=tZ!SeSGtiR}msh|MVxiR3<1S{_duK6r}V68@bjGjv>J0mIc
z6+C6Afr%70Kwf-glc)&v^&jl-VB?I1niY-nNx(>bWP>x&UJRDAH73nSw@tuz1bda{
zdIU@QHT<5tw12SM%2)OT;Q0d^Bh@meXJZfB811PYs$)rok{)1f-qBb<>7NF?y2!`W
zhs(>A!0#7?JR)T(v&(!9Lz&W*NL)dNcbI0Co#k7A5;|(fm08{iTbL2jk985SqsBSW
zPFMqf-y^<TeZ2fE01HQo>QvW;vN2(}4297}yukd{#AB&0CZ=TG6DJq1#!@B`_%&bp
z9#QaQlONVty?B)*UP{omxUmzUYa}X8$t4Iwj2K1%bSP;el5#I2q>_qBI)K97+Ho$f
zmguC5-(b0;^`Yo1O7_1+UAXWcgZ;>bHQ&JBxDd-_ez#YJ<fJtLsV+@*^a(e2JA8`(
zAQvG@crfOrr@gaUpS}TGmD9XpaVL|)56QXrUmh%HnK$qbMnX4G;(gN;SXq!D--Vp1
zwuu<CJ@&~0DytNY*;8pM%?DlZYF^R*^T(yE6l&brq3^7|o`Em=mSTAZ=B7b4s0cOn
z>jOMm^wS;HtV@9#pTp=wc9_&h#G6fuERFdY*^V7hEMLS^3S81YP%wvJyOba;4xoyN
z6I(E1m?NG*O<g-`rOo>OPu$p<&EFHZ)e)A-g0+S3%))Y*R&f=rX%f;##KKDR`ojp}
z(1B_izGb0eQ30an4)3tmnF#yfP!&+4`66(b;H!s;KiK4@iHSJ*3^1(lI+*TP)!+>j
zQrd{58bV<QY0fu)Wk$I~E%{ezX1BAc3ng8c10m^sy#ymiu(N*L6Pk$8*i}i3(!-}D
zSp3s`sk6Ji$L22Fc}B+}|0Rnma9BM(qC)AV`;RNKvl5?owK072H*UC#!+Mm(WUy{p
zmXas?jT1QYwtlw&&>6Ycu@hH&H1>cS9@h&nJCL~uvc(Nn0B1qQ4o~vvO1=caR_4Gq
zm|JI1Ht&G6)@tKb$yWlX9}&M=ogn=8m3EdSY=iC1)gIIuNhx~bNjF#GqmWu-p`)?U
zEO{B3qyimN)ejhh+U~B*Qks$vkEB*5MevI4K4A3soba@XVZ)3^v3(;jSG`y*JDPc^
z0C2{8)l{uVoQ9Z=$*Q!AqZ8C>O49;eIeOy5yTpMei+80`{=l8<NKED@G6CR)46HVy
ze6rD4I$?`;tck<ZTL9?5gyNI6kgrJ%N$j)m`e-c&b05pzvYct1LFlD!&5<xzElU@H
zymoQ(^JV`O@fT~NPz&aq6aH+?UW<03iM_dy0qWjb1{{4ve5bsU0dNI1$UR3@lIjWa
zcA~cRx}Jq2n5#5V`X{Bo1MK3r&os!jZJscQv+C7n5~hi)@kB4|D^E6hv<VD-whJsw
zrZ9oQhGo88Fs7^o)s)Ha%aJwY^qt+mmY_%i*sTxXf@MBgs!$<)?5WX<;at32_5HzZ
zl-B#~jhUOQlv~*KYV4J?L=8?iqhOZ-&1uGwL^$B^115K7SI!BOI?;FeLBiGYR00(}
z-muV(bvQ5XUsB@0zSVg$Aq<<cT}*y@KO$O!?u@37%ellqz|=c;pvv$&e&=NT(av%T
z^_$ebc&+LF&TQc$e(TK+L!Wp*E#W!h8UCWejh%_@Jv70H$aWEf8he(}o&(^1w<mg(
z<eCcyQ1bO95}3&*h|~t<FLnkN_OnIgR1!%}s38jT1YP!-xP_JCb)*AgI4D1#BA%H=
zv+#kc)f|A*IzoFK$a6$bNXdj@B|%az?`(R#c!h%rycXBrn4_&Yo)QPl|Gh{2d;LT;
zrB4G#RpPbErWp)mhWpQIV1>=O8SQzLj|j9JqNiR!cGjEzr6nz?`9NdTX!;WCakc2N
z<pxi%V3|g|V07OfHilq=6PefY!*a(=^d5|AZ811WItSiRt_v8VKi-zL7L~iU#DFcK
z{<i<O05DoNvEtpx(R(-QE`=Wj%;+NoYE9+3Km{RiuM8q%y%zx5CgLlD)vl(esw{t*
z)?Xpr<UJZX-GzXo$}*E9u!=>WQ_Tl>k}&U~ccv8_sX$b_(H3_`eODO9E}H?2l>9aT
z!ZV9Mvr7*gk9yrnT~LSIafKJcZEL)7V_RD!0eHaU&05nZV=D^t+fz0MdjX&ZX<SIb
z=lt-1lTA7NV(Abo)PG0bcQui$jznIHP#Mp1HPK^OTS$>5p!eMKL<?8XG}_a0Sj(^U
z72SSWD$$1DX}Ec>$qh!ZE@F^6md^yx(eU!tT0OhZI)OVXYg-Q4jvw#1;Mp{#_Fk-9
zZ8h=&7}StB70a6PF%i<J2rT(P$69u#W4PF5MsodwHMvzwR@~vCgX6{LRU&b4ql#Y(
z!mYC`RB!x#7z;L8J?f1)5I?eCQR8TP7XJ&*tFx42Tstovz8CtLYc?rBMJH0MLw5f=
z@}ioauZBuaK~x+0#)Syw)(6l4a0i=eE&kyO_RkaWgH5@gY#Sap$!7w;^k``jX6EOW
zDs4dR>cvs{G~@jz6HPWoRn6=EjymGVCZ{sQ%Cz?a_%+A1G!P?RVnG4*_)QIJ646^=
z+u}vWtOF}8N4hr!^-LPRz-8Uxo)!+gG@h$8y|nsbIve@CHBxSI20Gl*hMti(T!P}>
zU=VZR16wY2!qBg%e;aOInL)EQAwdN74t%GiQyPDDMc$akAcZxhx?8}W_{VX1wihq4
z#2rG6c?o_*JR1x1!b7#E<imv(PABeD<`gzj<~4Bxg5RxevZ_+*1m9aypO1buz*jli
zyNx0U;LTmO8F!-L?8!=R0^0>C*4Kz%nUQ|mf|^=G%U*1-ybB(?h<K0qBjWe9l2jSo
zWIBqlC6Fxtn~{AP`%!i$BwKI^J(i<3+S~bc1xaU9V-9@RiKh%nB$~1R2e|F|%D-4M
zabXd5O6;%ThF>DS6Sudl{Aj;fXE$F9K0}8e^s%%=Txlr<Qf}z$77C<$((i_G^RsWA
zJ8d$}pboP}<Gn{fupoYyZDm)ztg$Y`(F~&oihNpuECJ5N3R8v&Gu4*9cnQHY=0*$C
zKu*>qeO*%keZ*_Tcai}0&(#oNlFoIO>7_O_&r&+j`5^>&A8|(=v&yFMVn1u>xE2cQ
zOyh!wo+A5sRRDtq#8-Z*Fr&#A!GbVY8)vX5tvFb1X2>ESAu#Qqy6EHsD|Wif-ok)+
z$q;&~!Q$dJSq<=Htzr!!vH|*(7I!30$lUyitgQp`R>1EX1N}msCOtoMcZxi-7eQqm
z@mUeIpznNRRl^gxV{Oe1A4aV#R_<)iL*4L2AN>Brx!9Vt?DpgPh@W8p)pHiO)QYV4
zJ|bYn)6FXk-*r@auZEpl{M<QVF($lRkMR_SBzkk8e)1P4Jpbtm+3|rI;H!<K+fxDF
zex`yG(6|DWeSxJ-&-Aj+RN3Dn{tnRESr}Ab*U28~+9#jiftV|ZgFD7@DB>N~$L1CQ
z2Akk`AsSCMl;pq@;x5T|NJVqyQ3fUDr?5tt4rfrWQ(%n(kmyQ|N+GFat1}K%J3g^R
zqML&h=wwhdEuoL(ya)T6$rwlF$P){`u%4P-I@xxq?jbV%WG%tzYc7p`wbAVtlhbUt
zoPnE~9KlG!(BsVJ-xTRzLTy|kzVkqtQJ=q9n$%knNikfFOy_JHbC87Wjp~?<@r8BO
zo@F@f9(1MdSrwq>s!Bt7>192VUKO|_U$4IDGc<5ZJa0ei!w~js&C87@I4VphTCQ=J
zlE~VCpY-YKF;g`<1L7NF&PJo)8luM>OrEn(MHK~c#fLQHkv-{qi-)Zf4AfHq+gPWA
zv<F)&r4Kd`;YQ)ruw^qcdWDOcNJwi+#5)2&dIWmH?TnX?Y|4O9mc-Kw_ICS<(a?i3
z(2gV6l686$N2&rW!9Xg47ZJ}9UBp+i!FhLn`wuFhF#4dkM=5QW@{0_EQb&A^_+s5u
zDh&11z20#o-I_t^tzqzpliV%=l;iQ03P?VrwOqPj_eS1bWciE0zt3l7Yl+m6rjQnH
zHx)7zg!U&Y%@wM+;0Mn*v)5#hP_h99J!6G6vj+Zfw0D-tCtDot)s|Ru<7$p>1y)-4
zum+*nU}>8eUx~g>_}vVG9W76l7*>>ggD1SM$4QI*9v~3UP_$1rV5p?l{UhRs`;ZCY
z{4*YF!!3-aXWIkr&RGgYwZmqaCvPYWyyKEK>kO<Q==8=V?v1u;IFcQqYV=>8B|K-I
zIB_uv$=rwX72(18GaW%?THP?(qo)6B$5G9kel5nhEODrPqGIua3U;Q5sgCIiHze}(
z&Q#T1_WT8TL}hBN4UYAlGSXa=-mUZKo(Al#^?#O@nVs$AZ?isJ{7cJLOB1g=X8aF2
zj*>G|5J~P>NTI)B9m56SM)QP6gC1eN{YUnRToZ?H*5Gz5QqRr>4-7d){9m61@S5Cv
zuzT^^Lb(hBi^D(ihl_F96&d_1yyTaN=d~6|3IvV3kjzEI#Rw;`NdM9rK*JfXaf1Pw
zA3)r<q!VodL7XD~hNiiqfKZW3)+Tkk`W{GvbY;=Sm0yj>>1gjftr27x4ZbzrtTghd
z;^?Tqe6f3l8OmhIM$KnNvhWna7M67kSo{E{x`(}<SnYffSi3(^11Ol`c8Yk2_!be?
z6cesu5_~O)!82IinrdELR1e2-LvSsR3^^lX6BuYrHZ?9i^x^<rh%8S_!sTz^5bx<v
z@4vLXY~r|WEsIy;6%o(LGhw9MahZKZ#5!73`;<cs{<g$C1NEx%AgMU+)+~V5zqAJ$
z4HoxHsu~l9v`9kKDyJrlnFAioV^&PI{bo-{OmCsY0R`Xn0fwlca55ZrZM`eyQa3n2
zjY`Zo8Qi^{f5C!@b=p|OC@`dVd#c9b2#I;yz+OLdX{wrQkWZ6fsDZ@k3_g>Js5ClG
zca%XdDqXgOn?0NNJt88$*(0(t0Ze7cmb$?eU$6x|l;Gslf0YLMj^u0iEX~f+$E}U1
zU3^|8dnIJEpW0J6o)f1oLC>{+hsfO#H!LM&ZAsxD>mcemd&7kmXh3_S@pef`-Ii_u
zraZ9MCj^YTANk-adxlj93+vtu?6JhKW_|9n<0v`MBg3dfrnSaS44k2_JTBs9fE9=S
z)ku<VS#&7)^JaYj{Qz(AWC2**Q5NhV%Tfo_1MPp}^E({4I$9dk33m}sE(9q|H(O&|
zt+lS6X@^lT%^aXE;zz_kp&;htb};>i$hUq?gZAcS#1D(fKAF_jH6D{+{{gC?qw2Wf
zz7kD${^iVBVQJbG9DYJ6l(un(#U~YGR8^_KO$EF7o^4}2;&dw9eF6Wc4xG~TM(Il`
zvX*^5KpSnSV4wKk5BFgOGT;p-$E8vQ{K=M`+VFd~>%Ra13$n=;QCMs93`-Mqq(`8l
z+%+4|lQHIdO9F+J7gi{$?Hcht;vDfc;v2Rw0zfKIdb$4b4J%gx;2P*$H9>i=P{w{f
zB7Vbj4p`w9+<U>Vek2s0GZJ2?oNNxNQn5=meH3S1k#Emb?<L~rhzAl{$xs*N`<wNj
zpol)uUu?K<WfVm=iz%5?Q}f~Zi)R~rx7L*Jc;D~^Xy^jD=Ps{CI7wr1iMWgSmdMz`
z{*O4Hk}Llpk=xqFw4TBitjT-CbHpPJUp2+u9%`v&g}3W79IWqTs`o9W#21tj{{Hj;
zpP{;>h&k9F<dtgM@Y`#!$I{Lx_9v_9_E^%_i0Ad`D=9)f8gnsX?mB%cG3E*E0|4_(
zDPO77g>yE2rsHod0~ieWU+`P1j6SWWAyx)iT{9s`vy>xw_+M}pf{8j+SHMVPE+Gvg
z(jZ>=%?SWd6Y+Q8ng9Kn4;cZ7!OoZD#24dmKeD4}C&1Z`8`*Pdca{yk;;jb{pcRkP
z+AL)|JBYZgA@C`2;Dw>b3rv2G=1Gd2Jq46D3xEm>Zu1B3@s&H%gPzyu!O0hOtsEkr
zh~DDR8xn%MEE4Ew_}p@wOQ55gK&o{CQ|sUJ1Rnt<Z%%sSOD9{%#gBjBIh4d-ja2^|
zUi2lI=2CD~wPnKsz#Z)iCCjvE!dcV+MstR0T;6gRYt!)`)?(1so&&6srL%tnN`>^p
z1qFaFl-jT6Or^;_BW2Cl(1hRVt&vAaVK53K80)wIaTTOp5A)S58A}lq6$`QW{gJq=
z3@va#FHrB>h@Y(~C-p?7ZHrjA5=uXSC083+lAfN1Tv+PWtL5b#eEj{o7p(*`4s3O4
zUAZ#)IW5^`PftNBK$rkm)Qni^emzZ52=pHN(FHeX|3C1@JA1fLx_-)IwzMNXN4(+2
zO5+7HDyo64mK^MgT~w&M<|TNe;Bec*(b5O0z$WZ`Yi8TABB>9p$<fYMw~;3qxMuZh
zW@7NxX2q9;!2x|iZQ9N%x%ineA6WJltfbs^#!c_gEJbqt6}LUwV=!7WkiihrZjGLF
z6;~u&qdiqqwXp&uF;}au34mx=vxXe`O8Q(HcCPHEZkykOzDgtyr5S9;f=LmTCc%^4
zp#;1as<9IoROQ9DL`EBS2`aX~<9X-ikSCatDeFQelX(ob5QinRkk+sk6p;9n$@*_v
z3_we_Tf^VMc+<Vz>l~1umti@>VM?JfvN#(P&s>3@N<K3p=8oka!T#Qje}Z}VupL?)
zzyc<8CUOccXKN)FH7TdFfJRPyYmX)`8L-pP1o6%&hm4r7Bu9C-l4AjgtjrnY9hg#s
zn%@@<S>yC8l7p0npMXiDQDEBqIW0VpRu$RbKk?9k%D!G`MyzO=odzIor?nJ(FfO7e
zNNpm1UnA%qVq=5FpJ*Na-w{txW5bf^OxM3QlrYj#USrhqB+n85veA`)Ufx_Ozqb)q
z1sAWvT8GH+oPbz)kFA;KWb#B^%XU2oMyJ05+{jnonSN~`LiWih1mzB|bQlUB6xKwt
z2f965!R)qpD*0~=;;D=%*I9j*{tT1|8Y1q-pxb0zU=0RZtlt3GZ%}^Ch3{X0u~!R8
zl?Ski$DH?&7(pbSjaMq9+MXVqNhM1*sWNw@&YCTz`L<qy$?#Q&P`I-?{LvmvlB$^G
ze~;#>CCb|%NAn$76@h~k<OSxV<zU?zIF(?kU(1k`IahWb&Jq9Oaai|MBM;_M|NO9A
zbZbANRql}4>>v+P2dV&4n2Q{Nq6P0!_)8tYeC2l$U#%cl@c9h^y<g_Lk=*}C$lhaU
z1$W!f>GrX1gLEG-M*N^#N4%l_6H1-3AFizja4=z_1UeH#S$-rSvgc}dgrUhI>4#A>
z*U$PGMmjkk_{Ir7c_J_?C(C3|7Yf`8R9CAfY^(wFn$VyVefYmJ?Pb2PnX!MhC1^j>
zu!@E)4JK_cp4Jp;J*RpRzy_nVdKRzXp9;>;>}P+A_`@@Z$rOLOyqeQLSOf6RHZ^_q
zNQY<p{f$HMKs)BF(B6_ORLT8IHYd?$Z;pOOqYt?El*aFIcMot@AN<q_$5dp?&w5F2
z{zfsNAmwX7Wg|%DTha_X7~3P}Q>1x?IShs#(+oiBh#hv)+2ah)WX4yUvXa`*4)J0~
zNkqz;0(e+4YT$PL?mS#)Ra)c0#l}=^vd;vX*n)%QPO6x7GzE&(z1i8H2k;c}uZVwI
z>*Zijw8w##CK$clea+%LN3=MNgTd+t9@Zq1e)0owp}BBt(-gd~OTBReKbgMDCE+BP
zrvm>3({xQvZ}5&ij|>)LE!yj3Q@x7B0Xs1y2U@TJqqzh%eCA(s67UGgaYg`I0!ZEo
zfJ1WqDje0zJZ?|bao~zh6v3~F0~)M#b~F}dBrVX}F6%~?!k+tA;Kq8$tGO(w%XIg6
zd(Eiz2i22V@v9DmAJ*BBQ6U8fW#Taa6@7JTL6YSz=79JTNIR|g(p&fP6>PMN_zT~7
zv*ZEIRtssaqZP#M&c@l?Vqjre&l_7<vyi>%fh*(VQ~T{v3<~uR7T9Ird!uF-8CU>!
zTI*tLv6&Y*?Coa&q%eN@E#g;z_Xo++Xivt;XJ9I$G<Mq!dtRD2VsVO;fzQ2k8+K%8
zGl~vChcn61&4fik-?1?z=6FHxp^YroqhNIZDiK}m@HKd}-gdf80<bh=>Oh&-@`GQ~
zaFYy(*AfJl-`Ge8o#i7L)B?cP?s^{B^f`<Dz^V&U(2nQimFpCyr_xcfxbv6(ViUe5
z%N5cyJa>^v0;8&>*AT>TXI?I)t;~%KmWT#uBK6_YFO~LPV!~e`a_)_oUl@So87hOE
z?b)$42z0joeU<|)p=!S204__twb~CZ$toxS2}(57o&&GakW)x-mzH5FJN=TfONCIA
zImg<2Y6<Tmeu_9)18s{CX}}ZRb5msk!qh%-1o$|s`M+4gCzGtCHK+%?&kjcy5nph|
z9XR^3MsSnCZ`rmqAL62}<advWjJ7f6v*!SBZ7Q0~`4^fr^Fp4=ILE;R`42_}W>i20
z$5wr-Kao&hh&Ts8f>`;R11ydFQp09z&lK9=Iy;8GrYfSDcRyR<^nwXK06>oHYmvSA
z&WfPD9a7atTTn?Vgi+n@D<%{i-V3HJQPy2Dj#>LxM!c~c9T7{czC_lW?D-9@w&4S{
z4Mq@`RRTs+09BjkQz|BNxPp-9<9u9Wu#((uXH@wWzM-T@S|!GR>jOAM{59f#;N^~{
zg4n~2OJlL<<<RMw6VA@kuF0NfsIp3#aOee_K8q9{2)Tt3N&2N!f}nrsa2=lBjwT0k
zMrcTIjBa86DOEWn&1ooK>$H5Dgj9wJ_ogMNteT#FShaT*9THMa*mY#?=7K^H2mVeZ
zyTec~OOEYo1Lto)F5bDbO4+;%7hAc;a*YgvFPK#>e_om}?Zjy66hR4o7vPJD+n)dL
zd&EzyZihd-1%L_8;b?4)LjTU1VPn~A5bGsS$$GSq)Z^aIRw7L+xLYVhX$@PJK=q6r
zA@P#Pkfm|hEV&K+VLs9t>oKgaUq0-J{dOp>tEF&4dcV{G&7b)r;swFlb=lL=WD7t6
zZj_2mCARsdzQGH<P#NvBTQJi6B`cV}<s;(X5fc-UQ~o(P(iduQZ*+feOhi&zmY(uo
z-mkGC)|VClM;nrmkzwlm>e@>3MU~7Y>qv{xg|#09Crp99hR7;ylxvgt&ze)M4!NVv
zLG5TRP@;KT|3wjaPtJU_$<NYIcDSThhSp_-D0?G4EAko%BTEbP3mI1%@yL{_o`gkl
zaDJ&FJtLtR8MQn2O!YR}WiW33!qsJH?8&fCjrUU?v!${%+LuOh3p+UcntY+PkXz%A
z>vas9EQa0LP+IA6D<i4pUKbqxX<6ysLEhc*?lZ(>nT<gssf7Z7>t<EcJbre7OCt8b
z=_#$0x+NnovOYxF>KwF=Bu<ihZrH*TmbW-SjaBy^xweyyg^)(7%7$)*+lf?a-m|M|
zc_wpG8Sgc}TBDk3t7l2hp=PJr5Aa+WlK`>jaynDAd3Rw(2R=bnPF%%>Q+#g{wjw)F
zw88IAs5YeHlQMsP_r(_fmO)HZM+=(gj)=nKcV@r)oaN911p5WHreL)%u>Tud|313}
zvsE4lM$%w+hB)UdPhHT6GV=i|9=YeMjYj%|l19dP45p<TDS)W3<G`d?Q$S&?L&1kH
zxWid?_7w48DOOs;yc%LDQA^#MinHQLe_cKR6m~dkAj8Gx5a`@qc&5g}`|Sxfg|)s8
zoWv{7g}Ttmc}+GTOgLUrnlFt}xd2P@2%yD9U#<D6vV>oHwu}!D|4}ci(S>z&9?2q$
zbroMM%sJxMh<(J*COtf}V-=>J_d$hC4LE|`rQlUf@{y52rT>17z`2k3BjO?A3&^ym
zh}SG99xkqbqPp1-_fN)%9U%fT6=c0G5x*ivd@y03I$|1|*OL))DbREfK01ac>rNZi
zQuuoLOjIRdZ`AQ>2%7U+f}`KRP`ax4zc0j=iG*Zt9li_iRT$%b1#*@ax@P#^;21Ya
z0MPq-QkEVjIq)HOc(@FIG~mPTtd8EYq5_d&b-+Rwzh#nKNh-6&@9mJ(ewYkYeE6T{
zF&E)qsK8!~Z7FOaS{ec}*RNH)TBlBLC;xzre5{2yg&p1lnyrGkS(&n-2ZY}akMs^(
z`d~f`zz)!PXKSQv))y~rPH!f>Uok@B^%f*TW!ZJj@qDAO@l1z)neD`%kLT|ZKQPa&
zH8Qp~QL9B@c|#v{@=Zz`zB2dd2x<9X-QaB!IusesY~9FV@Mu2J@Vfw!6XM7-mboX-
zS82R5o#l3oU<<}PHk6$1tX?%3eBLhet=s3d2a_>EGRZ%1#$VQxnDi9@mq2L01y3@N
z(6Y0cntLMfDeHj5*j))XjXlai0YKW;n+-~aIAOGt!MvW~IU7a;iPTwGZ^tvIcOd;B
z+<lnc>=N(gCNP<7LY+e*uOy*5qlZn)Ff~bt1_FqM`au%bvsh#D<0>~O+1#2sN*gR9
zAlBDE(!;6>Y)w^Xr2N}LDl9JEk7a8icF)pQ)_|G;FcZjV%jF$c#adm?{<93OhlSm7
z!(z-6QmITF-~fW3{r=BOl6K;b)ETkJgC?@e(gqGUd|+$hgc2=ao<ck^%)ZBI%|ZXk
z`h4FbzS{%x{@gqXtxt50m8Qh3t@q(<Dei(oY5ChTQkD%D)$-vj0ie!e$USp>C*ZsE
zw5re*Yb{>^#BQN?3eIc^25xM&Li^b`p~0~K9^#<jXt)ILw)r<VKAEpLz}4<~gwbF&
ze1O$URS?olOg!A+iN4LrhKRaQ{AOPRD^Xb(IHO)G>^CwJ5c!1pTi*@G4z_3;DA}|K
zBZF4k$BZ%W@hYzv^L{;$bfWO2xHEzMPn^nGlU`6$K4%f=5hddmCN<1sRW{fuQv|H=
zyAv_^YK4u8yW3b1c5(k75zp9~+Wt!X-VS0`I_^vvW2O)*U_fiqP1RN!YZXneU(vTV
z>1M^0*bC3Qs@L|T!{u8zE+I`#mUy>TafFr63Jb$J2q(z98gI8yQFZi)+;RVBOW7on
zY-!PdSqH$d(=RScDeGYKj`Jdjk~@_2R|$m{xHm1?#jpQjQZX?U!%}UwBmMeqO`%TK
zi#6e&-VwBy9Nv5ZA^2yBt|fbGV8?1xp$(kwh5bk@(u1CQ_GQF!$A~vd+3DiVhy31x
zo<LW{n5JJWkI$_CX4!Ql7P_@<6BoO<1XY}RJ2@Xva9_bQ;*S>gTbOuoL&Tqq#Lxjb
zqyM|5AB$xa_C1*cDD6QRax%O1CrdobywKkx{ugvmi9b|=RIelO!L${Fc^OsEHkiSp
zg67MhP(H8qg=|j}jE3du5(KHB)1q1UpkBHo{rRT}uu}Lor(j2-uFEBbOs;sp<7WdM
zJ1pbgGz($wtu;y9;?Bj@N@+c^90i8;BVwXWsL3kXWXY;S&!^`A8UG&f1_Dbn2uU5(
zPBx(eae{3yeexm&-atOgss0D=lgEDJi91rl-hTW{201U-)kZTtqZ$`;Gi8luF7y9H
z^sg+_DlNHqH6OAN5sWwu9jqP)I7@Fkkok8Mu9M&YKH@7?%L{cUt<XSPIDR&bwM1`G
zkVl;i{S54%0zgK|+k(0Wb4lVX=6}_2f&~Jf+53A^w)q2lsI7TXdTX)Cmr+`peY&N7
zrn2_d$(r&OKClP(O|;IIR@m7X6z|rZw3EfX!g%x1VnD-%WcXWM#Rl3<lzeTjNh-8O
z4MrLK85yvo2v;bmhIN%lb#eYGqK{al8W*A!=_pbx_}dgpRtA#;1brYWR6y+$1fahf
zs6B2`?46J|i8tRS6pXfh-n0)$zYVa)n+YSk0mTx*shv!1<#PI?sgM?@r^H3<xyiu*
z%^e$ivO3&t14kP}RvP5cF)ld$>Q~9!g2K>bkJ6a9VAV;k**=V!sQovWbI)w?c}1S-
zvgn9yV`2vPj0(P51Mt<B<{a@ONxqZUh|z5=$vzpYtYSi$P>7Y0&LdRANcu8zfwiIJ
z+e_v19snSqAWzy+qCBt9fJ|Wi<cIZL3V_TP{DS>t8gZ^xaZuCxh5rg7^k6Eq+e=|l
zR@l<LdM0t1@8Wem%GP~YZ4UvuQnY9Y%*Tk5X&~wKy9`*7w6GSegQd@@_f>nRBtV|#
z%b8z4E5H<1;!!bb$Kic5HTxEaxzO=z^MqbIi-aY{r`vh+C7h@gJb|w&OF{<f78(6c
zvV|8E?@!hc6ntZAf`BI5iZmMBts}k=y~A#JXODYIH@{>f2Jp~5?>m#h-gC58Mc^<d
z?Mxjeh-nApRg#CMNB;tES(5qxLe;nDc(;})q##x-eMkARfJRdaBjzBJNLB`!<vr#I
z`m&Vc6xkGZ;Oi@@_#M@(5{dDvUBYemN;bWs#w6`+1OIV0K$n-X&c^*$+^`b43a?n$
zfTGe3Kv~<B=yfHR9jMP=P^GFQlc7MiCbbB><m}1bV6Gk`{%Q}bT=)PPXKgXiY4`%5
zvR(U-3IA}h^yf+q>d{86SB8mh-lH*cQ;dQNg3^*m8k5f;KO%lN<<%LFy|cRr3lF!O
zSA45DRWji#59-*UG+ES8Bm+G|52)>Du6MS^=XPvFCIdTTuoqe;_V}g}gj8_^(?+DL
zY4%i8t(fvUT5_N9UIjutVHnXF!9nU+!um5GI+0_)*vOSx8{PtPwkAfu0kDZ(d9p_y
z%j1dNZcSE^s%Uy{J}|YJiGKz0J;H)U_WaHe;|<DShtnPm02H|KDmgzBiO0QN<b%<9
zN3gYOx(gLqWpdHkv0bShH%QBroQ&@LQYNk8(<-xlh0^O$YhKq~JOcqh8(neS<-i5J
z?-CkzqRjUK-Phu!J1D5$z9l{bp*<DEmJ@&E8^tQmx0V(mC-AmXlMbT3C6U^gwz5qc
zNm}+qFrnjMof&LCd(y_VSqlS@-h4x)w_|UWS=A7Wh5x<;D<5Ewr-(=EY7BNxzM)QH
zz(fjqXXM|=DO2yq1EFrSJ_Dr@?X~&i*#{hP5}Wln>-qFNNBjorlcG$TkBZM<LBMGW
zEL!-3aJi!^cEA?hgz_(?yz`-P6OaRbTi5YL&q?(?Vm1#p*Gd(R6aYz}{f-aWl0IKR
z&09F0L)HO@^~2r~B|9`*8<0SQlUQVs4gWA&1zGlU2c)^!K}>6C$B5q|ULyXFh<lt~
zNj7#S492<u<hP*yuM??gOKg?CWV_aZNG&9_(GeNs9nPx<l0B_6pvm5$Qfy;D@y+O*
zDeHd*BX&f+4>pNkY(%m4{v$uW<t$d1pp+qwaMMbt#41;i<Q^n7T0>3G|Gh>3l*u2E
zn+5)KV@2G`B$=ra>0-~T+{2`Y@lma*GZq^-Z}+be|76tjy)i$%p~>pAVOT+#>=oui
z<*SA^N)Z{U5M(l9Mirn7y2pl`=$%gJF8fe>?Yofr1Zn)cJ=0gDLPiDD1A_JE15sOP
zcL%pyBp?3ZcfNxCdP8;-!MHG>uyA0iAS#7FV7(m-`7S<xmCI(-iW}YlWdkvA1_5@)
zOjjlwQ6AS|dGG52guZOTU``Zdx0cz7pvbFeL<D@H=Pw9mwe9HDWY9MRIv8Phwjye2
z&qxz<=hC=8uU}mgg*ESOjdm%_{vX!>B8+p9mncZ}N36SLYqm)SBcWii9OgJdDwMX6
zUE)K+)OI@mrJcFKl$g&Ee^AU=yb6#&O`0YaWzP|qK%WoE5=a8|2!^O^it`rZ831{O
zvmi=eE0$+KwJ<B^q}+4<E|Wn@ztt_n$c_VY!NV^AKw(!ENS$_6%--OY`~ou+&;UbJ
zTHBeHO~GWBqz`ETP~oCR{ntSrG%U<1t7NQsQ_TVHE$yo;cAXOkR4_zAU{}(|K##aH
zOt`2Z>&j0N-y_~lrFBmtS#tNPQVt1%R#v2f%*djPt(f01Acam<K<_BV!cr#rz_|#!
zK~0^JfDD+Ja?!KN05{O1YUD@}Kx;X|84!EH8A|;cR^?lKeovArIzgPBCKGg@^`XlI
zcGf~Sa4H7uw&o<J9CpDJ;YTi~VE><%k9fWsi@0!L59Cf46Rceb<|iE6<+D+aBUhzL
z(qMT^!8uiepPJx6Zhz%0$n}QVY`Y|@ZZ9=n+*#tg=OLH9WgvMbT6ptl30BEOU=?03
zMi?(XT$*Q4^`qdYq`_`ULfYgKDnuMx^OaW)%m9o$e`Y*#!wpXbsqjTxmg5rfPoVVT
z)i5ww_^ia&ZP?BU_`Ttl7oXo*@6wSc*|u~5Y#0Ne{+#^cze8C4VsvQ9$yJf2T%)go
zG^v6DJacAbi1L}K{RqGv>^`@6yJ8)&{mkCJqcXp>AsD>D#MgMmOHzahjB8^Lkr88G
zVR0j8_?ecvTL5TLjLzQU$|{YRj(=%JMKB}aKS)9<c2z`HVU4XS<t<L{#U7<1k1M+K
zF5-WHMhin(f9`(IH9tr6JVm<aDbDxwR$s8q%mEgKQW&We&?Z~U9fmassTc$-(;11_
zqY)%EC}FcUpvpXEXLL|$>eEa1VFbPq7ij2$?-C6x3h=t#zZ#Z&M+LLMQMVku0R#D1
z&!cNN)>jHRJ6P$`aPSFp&T!C5VhdZmgH3@Jf3r*aFbd1g3gd4oGQblNqsJMg-d~(S
zi6Tg@cWXF0a0dD7S1G(PlCB2$%Jm*8W=;C!8S?6c51NSEJ>szq+R79_Ol-kdCXih1
z@Cy&N5KLdQkF=bunDT%F7cX=rygx@&iHx=|3zt-C&$$T8&o}&KhQiXR8t_e(HIUa-
z1^<Znksau(0>+g>PqjSJjUmxU_@|M)@`@U!1890+o+uJ60H~H=%MJ}owblivI6^Vp
z(${)@8}f2N@;aIbxuKB0kb{)mgCaxE)^$s1!Id;Iqh>DnN*&{g3wh_d%WNw^VY&ST
z@UN&T3w!N}z5hr8q89gP&mU6%$BYagdCxl>)uITv!IDbTnhutHO@Nrq`V1Gy0Jmsa
z)V<o+?hicn@7ecI%+s7cJkdFGwSfc|Q-~@%zx{aNeHuL37BTNEM+g#^^0OkI9MFYI
zhM?pz;vKKN!GwnO0rtHRqwruU^u~DdK9(OC;13>cn0%N(WFVMkBs>!iKrGyT2^Kx>
zoswtTs0e>%!8^oHTG1AKcv<tO1^0F^B2ivKy0G42XlDJji#vPv$A}M`5GVA@2bH8J
z|9iy1(`YYN33#$clch4)!ZOg-LWhTuE!{AMl~^;L=S2Fy$4+`!hlxha^I8ThvZ3IH
z59ZaSXv~CO|M3eYzTo~#HczN($Ue?X4)~rgQZHc1jZ~c91%If+BTQFVDPHO0$ol^g
z@$0&eNc!~N?pY0EsxVHhWuFNEBbOemQVsT-XcANvJar%qOZQCw!f3LG0pP!}rAo`s
zBgyL>W)x6RXGC^=#38$Y1=;fK*40a)Kb&nCQ%1+EOF9EKw(QtD;Zxl9HgO<|J{xZM
zjYZnq{NkEc{xs<N5{@`f1uail2Lq~I@_8Z1ITs$_-R1o*R*=hx*gGJvN+d_H&tP;`
zzLQzZSz&mkB;kGGStA?k_saOduo+yc6$wg<XCBuz98q|c4JDSYEcoikk{ABk81Gyd
z_E+;xM-0B5!HIJ>2W^4yJ8XGxApSzD`T_x}IxYRx2M;HakV=*L{KZ$mkUa@;*jb*x
zx96cfsqvI%Kb`R}K?WxN_X9ANs_1&WbivqRmvU@uJIc>);mpG@ydAW+Z`o4MO9Oas
zd8MnoC#>zt{#U@rOa?LlSNGg!Z>soLj8g3mi&ulbc0h-qdU?Yet~{}L_$&Y_chWar
zfXg)!u#ZFpG2wH*GkHrrAti|Wh<NO59k~(ijIPXF-4eK5JfjYPnOpzMdLDX18{CJ@
z{1qT5xSqMO^f3{V=+`#pu?C#>>~W;$x~|iQ!uHf9QKb<%C7l)@>kLGF2s=X)v#tCd
zamA%x?6fZ&U}gHO(gOe0y3zNX&0Us{8_6yvwm{N=-k{_@;;#|^NZi=`^*=@Yoo3vV
zfzS$1T7y!%brR)usD?Yd1OAK@RR10EZ=TS&SpSLiM>(&s*Maw(LGu|I+=O%8VUFr+
z8Vvy#_SCIZh$eD%w6s97u?wE$yao_K3>+DGvNLBm@PRJNVHei0wza_*XZBpZ8(R|^
zq(%JR`mYMiaf{QJc%#dzPznzH7b>l1@JI#i)7w*q7R{s+lV|>5oYdYFacaA|A}9Hh
zeXjbzy|osxX7_R;n~=sP*4H1_{n}ti4R~OS=a})(q7hX9=86dxoV*HKx>6P%2!IO!
z<jGQN^`Oeq!r3cpQPT|GRd6b1Dy^un$Q^MyRS`Z!JVule?+gw2O7E5X#V6op%A%@C
zaZf3{X4{0pzazF>$w0^DC`=uoAolFcJWHs)+Zsr9{nh42x8@s4swt<njz0M3Ic<9;
z02C}&X|mWef5?Cc)$%9qcy?o&By|RjEPSw+pK3GU-Uu7<v14}i!y?0u^r2^tUm&LK
zojFKeUBqt@{~hto!p&@`?=0=ak=(2uXfO_`usavEB~gTcQpSfh2F1_zIIn?q>9Yj{
zZDEaByIM@8xPg+=hYk3?9pI}?zblr1DK1QP$TGS93Ft9e)32~w&#YK|^_0DS)q8sa
z(8hG9k_pv-z{LSp4{k$GNWs1;gP5#5vhYELIr_rpxp~)#Xx5J`e<S<v!HA;sW@pxt
zUvDD*drjmG2(cciV2iw}?9w(yq%Q2I;n)|P5P<>Jfg9_WOs{`hZj9bB;x}t6ZujfO
z8<yqDD=y@WjCc=7xF(;Mh~sJ^8#R9P5l6-!H?TPQI=0)=3>{iFe<HaS|CuR+Nr`(k
zDTT6uc{?5>eq@_)wr23-lA|d>OtVq`nn1_J2DVnId$qiD!x>bfQ25uB*w@~i>DlVq
zJMLI~TIvZa!rz5;llR{uk@vq}yktEM0Zh8hk-wM6->!Yw!1FL4EK@m|c;o~@GG-m9
zs9QB@DQgmf8OXxoGaC0J;y0e0HUgY_9>V&Bk;J-{-0L<1xCJ*dZ{!!wz<F)Mg_%x^
zyZ5tJaZ~pZKd&3XCNt}58T4D@82X6cBmTi;gxgDU)eQl3zP6W$?`Si3oXt8(!X~`j
zf%4%@vOL*i?kCQ9^O+)|K*qX7O0YrMf8<}KWl9mK^xhqbOKZAS6{e(%io|~A#k>F@
zN-_wAv;(f=nG7t=;HKq`6kJGW<B&%vjB4$GM4s(e-Bu%KL}YsE!%Le1sd}eiz4Dua
zIFk|5jGWC1gA;i*SUei-_ZRDTj`(-P|HO4~I4VbsdVst-C1pevS}r)B+SJ!0ms5c0
z^KICJ%^LW@Lzun+!h?T$Wg7CLhBTv>SGM|_A*n5n;DC27)+#>HERU9isP{y|g*xKF
zxcCSU0FnUB_xH}jx9?30Q4jzw$pW^P>K<9H4?N!1p0zn3*16e$ikLNqUb6a{Kh#=$
z8?xcIC)^a)!Xz2vfb}=*^I`+e5#J;J;DNG}F=;z+)P+qxT5)>ez`CVGP3gxSqN<7e
z%6R{hCu%(s!?x_|mUwi}0bWobg+-Z-%_0^PI#7O_&<kxn1_`ot_)1AngO0j0bWr;l
zh#oDbTsX`P8&;4YKhUVLcuDOc6_IRl{{6rZWQB|U8&H1S@#7K!hz#CaZ)Qftq&lW1
zDiy@D1Ir;T+JeTOfSGfRQfmAX?0dp1)-1#(LEsbDJW+8ANZ$_|%|1(=pFbPNJ>l67
zNDd~OjwR;w%%Fe;2ri(I8V=7~uBfb<t?lfR^B*{diDT1}ue=fiw>BmxQ$tdq?kK}a
zvQXNYFDwa+<oaE<*MP$(Q>DKn#7kx9tqr{D@QU(FE$B%RKBI#em}c7B;FYUYvnT6o
z%*2p^;>3>Db(Pcx(+>v!H&tOuhJ0@nR4hvh_uOf&5v%|#Ug^aUb`%+<+ejC%vYLHZ
zKzg77RgkH-isKEzO%~=Q-Y6I*HUgr0gRiLrk;>75M`h#1go8fT(;Np%t49-UZTa5b
zVyYOi1q(Istp^seF;YvAudzAh83*(f@&AeV%5kl@+BX7QOM;tHFe;gawS9@b-<dOc
zU_Qd4h$`*$0W@{A-tiag{u>$bHGBOk8l9|f@dIM_z+wI+;#(r4tf2?qu}aZVJ1cQl
z-0bKGwjem@SRQP;Zb7&iA?5mP4XIkP9hDuaok_EXi0Ab@zlnW5U@$6~PsS&YB-D!<
zh$>qZvHpr=Kn;@@8j#OP?wz>7j?J%a7E*}@S+er92wI^++d~95rW{i}MV_p6A2?bg
z=ie;Z@6o(mISwL)(_4@WsmVaxFG%~KR5b0Rp0y8uqZL8q^<+&fGaaiGBc-rfM0#h~
z#)`NnBo?e(Ci*Je&3Wt>YrhFj*FNIOW`*ADx8Y&1UrZ^Zs+Q3AC42IMWP3*>DEUjN
zgFKa6ycPkVAVWHUG%v=D2HcdYv5ORN0;y@vNJB<naH=|k-+2{@aH(9v$k)ANYWWnh
zW}aD1f>K(}))=7ej8gma&Yk+(nr=$e_X#JnK^?OQV7>k32#AnLRn}X%u?1S(|7hyB
zvprQaEc8l-miCe<=uNoc9Z^ZT25At_*>T1*H#Nt~7v@;*s6NaBgPOm|jDk=dE^wL+
z*Zgh;*`rB&76^E5B>KX(t7H4#8WO@X+M5HK;QT(243zdPF}KmiXz+{=6$tAs&n|2%
zh#+Fx%QoBpi88mYMUdvj)a3D54QarW1vfu(NAJYoiV|O@3^OA6mv%$UokYrE9PQ2i
zD;&r~>8Hiubs~h)H%T+{#3+(8&aARJ>Juy9nj6n7enIu(!IWns3EgCa37bU8ThQ+R
zXcW6-LtB8wyr~CTy)%t$6B_@yi9mhh>L3d9W$FIOPwwCng+2<~n)`_F$t<J5fj=;~
zLipqzFrd<7Pmo+BA%%IdE6tw`79Goov$*(00NC(6#SO;r)g*QLK!$@?)lZ)cOTiT`
z<To=Q#+8f}7k$QiO$_rY67TDuZ>Y2$$@yo~d0q~qE0ad9V51(n;bX*iG&hrd;|BM<
zh(XG{H`aWlc-z|pId*JyhJzkfiQxn?`vuf<;{G!TL?Bk}zP$+^3R6dAQt;q{rT^Np
ze|H$u2@rg@K0OoMm;`|8NSW+`(j|X?Ndl-eU-#BEGLemh(NrTpus8tHnr?02@ZR#E
z6Q%HxtIMqag`;yK+)4B+@3mpjNh*T)ph@UVJz#In@W^n^$8{1Y37b-(i6l%Xub|KM
z6!DYwNTs(hn!8XjrlNaUF*p^{sf<x@;Kk?9bgcWpOcPEU@l(WKZAQa|oJKt%A`bEz
z%#Jy%QhW8*!c>yJbx>Q2uMk{(Va>JSjRlRc(O#KLnC<bN*6I2Gp1@ES8;X^I(o(D4
z85PmmQz1@V+7~n-3;XHp@&=<kIy`C*1F&0<(is?jG~d+=K<6XkJ7xU>ob=|-Yf;K7
zY=n%C?g5h;*{4*BTo9C==w+%f6eUdZ2h!K3t0%Ll+5Hn}q_#=1r}gUj28was8)wtg
z;dsXyrViLi+rj+NgfQ_D@rvpU<@)|xo_4my`U`5xCmZ#h5dw)Bt}ro4p%0vg51zVI
zCI~Lfr;av+qi5#^>wcck@S+!OA}oBMqA0B=VM|D8E&61lw>Td;y(^2qW9Mc}`ASWF
zvHpSI+$?jy+OjbTjM6%MX>-F0OLiK1R|ZQ6GcsbCK*JWif)~{d)0^3(y?hcOTu77G
zp3qxj(^)Phkwu)_i}AS=gF1Equ)K_T0t4xsj<g|fp~N#8xX9@v5~R|+zp9=GYxes{
zzCz{m(frYs!yATx@3JbsiQ>SPQy>Z0)x5}C#2d5e767KFXgMcH52Se|H{4lHIwdTi
zD!2tgrmg+vOT-_`r#xl)kb+v>BM(jJO|sFk$6^=v$@!Z*AYSQRnzwh>bDpjAHCR+%
z*uP3V!uYfC&$QQ$RB*6%#8m+BJ2_ImY(PLG_;Aby!3i5aET4d~;eDkQN{{PiNZpXM
z!&g%uDi<B005bWn3igFzyn?dcXyKtvPH$KR{K)DJM%z5t0U8-BpQ@lL>l3;_0qB!g
zGMMeUi$=>{8p-uYHBLpmjrHhk5)NM~lr!%;Szh0f+KS^kWSe2tOzA$nve<eyaGqZL
z#+r7uBBCQY8}qzR9Do97V;t;YQ<QaX)ZV%%0v8PJ`eIMy3L@e6h%i?C6_tD@g>Uif
zUxEA+2yi6&Tp5S7sDW*5fMLVY4X{FuHuR4CtX#)XrPe2&SRgvAMvY(NPcrs1+EG&m
zE@eV#GSUGjmmYzJ;51n$sc6HNl~#FWVYwA`&}qrimw5l4%rY+{RuOu`IV}k(b0u>A
z*Dn!O#6NrqML~Uln6|k7S8L%r5FWSd3}E1d?J2k1lm4GT<0XHS=>rIp&<*gXf$_gn
zHz};D66?>uM?B+sCR|-u6OfvNQWaAJ|GlH%WXsjobhfB4yGWlG6p-Xv@1`59h|eAS
zJSHjzRcA*N>M2zph}Tz}*P1r~r%`3Ylilv{Sr?NZgTd+v>ICljyyx4CAFo-ojp1Es
zpBw9<8IxWT$;eJDl9<@p`gX|srmTd)S5eY4)q&Iy5Z1$-r+`#_7+HeS8hMV~ze45W
z0MD1y8ftTxH5qWu|Gvyv3i^8W?#|9xoqS9Q%E=0Ur9BFDGz^ilU}|+Mc@)vSesgzI
z!EJ7=w>|Tc6%>S#H$rM<ZdBKM``Ct|b)6M4M;5%G6o1GzqD@5Xi+R}=+9qsJhO9d;
zA}9eMHGA1uLKsLq+5`YhKo@mh9clgF;)4}il!}k*-)|9$hqXh=v@iIuObxHF)@eaE
z$#v($ySK+OEnLWCJgbs8p&}K~x@C(=Mujvm&a1JZ?0aiIEe$=ThgTLl@?Gj*9muS8
zbqfGju-g?BqO;WpLjQ?o<UY%y>G0L41vivyT1JZ>*VV&1xeRr{t4uoK@jeVY*U1K4
zsoHG_`I9Mw#VzkJ?^Fn*&~vhOj4c);_I3nB*V!N{CccC)c``|8gN2{h)>qnNt6J#7
z-V(XO<|t&&U`6_)BzYI{fP`m=xTm<K3?{>(gmq06;Ozzj+`>)1CQ35#M{1zdrJfN|
z?MY2F?jx;%jFi8`WjxzAO6F2$vqnX)u-OG+<(i3v=86MZeE16gu;r{4hBO60x`>x#
z5R!KO49glE`dLqRk6kJw&rlGttW0zUWlJk2Y$zpG$pKVq+F0Lgz+Xvter8<cIkBIP
zY`r(0N+)tM)UOX`k}&ZErJ<a{9`JSH{MGB5KN(v!Sd>!k)EkSju>Z+W-WLAno#hxX
zcooW@Kg+<X<-zDNn|t=c@yO!y1HV%&dqfVn19fE&jH_8(|4hOB3rGJ8t0gaVw(hKU
zodHOpS0VwrMEpP(xJ7kQ8;n@w0fk|hD^Eb$SUX+AlaZ=y2Z*{OE><J2?LTR(uE{XQ
zMF1$FP@dSvj=XSZ98I;3K!foILTX`(6hY~uHFzx!&}I39a(5pbfXng{GOv+_vC7U}
z0Xs9=P@J>~T4JT@#gUk-G1Paq@S8f22H+{xbHh-PGjV)lApBDbqU1WyY)Wr8GMnmi
z0-{8?Ncy;4meMx%rFkiQ1w9nzgGPGcc18+CHhR_UnaAqUz@kLL<GK%{BSG14k(JS%
zBDe;gUH|S|I2rRc-`-4k{RC$)5p+CvNMn$MmNlu|7WgM=Oo{zm69=NB_eAs_FqO`<
z(fzuKK*w-Pk4#ShdRumHS1?+?XCKU|ACwfn!VqQ+<YaC%Q$wjx;V6uC#_mYl8lTmm
zbT(SHaP}oysv|#s_-q*5Xeq-L+278paw9X|hqabvW6}SF!R(B^mxAL*@=RvcKxVe^
zD_PILSF31{&%F{Cq)v&+(9?yB5)4~pfIXGwZ+v)12HV+$f;&@n-dMl_Cp+hhXL6)F
zoYzD=Z|HQ*Q9u(>Yfd3%uJgc^Y>nZ}aL^+*@NT2q3!_uiCRFmss+*8OMM(7z?b!DY
z+a5Vgr_bhbs}XF%^dy;AlV54s;o>U<YDPTK1A$X2j2;`_CY_+Z4HNGeKoA=fILY;7
zD2T}_7J)z6aNDzEekajtlJ%eSf<Gd@aUmU$L-q2`Lf>&6K(^qe3r?%G7W0Da{(T)i
z=a<O8${r6|*j%CD>&EpGROloI98hgcXkmRAk82H|%5Mi2W^at=0RuZyqR+IUhbd`0
z2=Ll`*sD!btr8u#KHu`hhr;>?3!Ij=bHlDJ?C*k?|3HX0@Lh8|<O3Fc$rd5Q9QY%p
z;K9mMfn8kIM@Vs_G+(P?_3Xyp5V&P(SVEu@Na)zV5*XgBPM8sK3^u5&G8t|MdK0gE
zm(<`2;!cHo2x?$oNj!T?oYGMug>{=2_|49Ep^M?B$OiBUo9_wTbCv1<l6kPFVLLB%
zTfZyYUZXcC?8;S7$;;?P5?*kDcht1!dwHfXwzXb^KTYCx57LplD5WQ1a%YlmfC&6I
z_N_M+jx78Gd0R^34^UKhmiY@)DE|*zz4X=LKuQZVC0AbApwnU9N!mrcBSIdH&hU&n
zxU`H9QV?(OK4+t(OK^N}NHPIz?xp!7;wSsHEjv;Zl7;;iOE7mI{xjm6jq>;-qK<fq
z2#VYC2YLSNY*jb0Sa(L|SLkA5ef^U0ZHPx#Qn>*KD8E%E4!nTgIWx~pBx7erIiTY{
zIEYdDQF6ZTa3&YN=xqJODStD+8q1n?SaAbwP}!WrM;q7r=gp5MyZOLlA52O5u}uD*
zrnnJq9}&Nxc6lNs9*sz?t-C1Q!ugh7VL^9TvGCyx*<Tqbx$xogq4WNKXMC%AfR#}6
zMnO(~|5tco{iaUtT+M$1v~02&d`a#vHN8C21bTa-)}Aq8k3q+Xi1?W%lq&{SQcb>3
z&hLCqzhXQaETXbhKoXPjvphhNBslE{_UsGtkC6*pV2}iv&R^e>fu3>c2gI;x3=#!;
zv3mE3l2pL~y<)jH`%fvLIgWhrmNVqT)*eszyndvc3`lPpP1I7dZY9)G%7K>VnH75l
zLf#|3N1X7x`2v7~cx`7`@MTYwELgBb1Rj7!BeyUhK9EZn2i~rQ0Mg+;5X855tioXJ
zmFF2|toJSAza##a5kV?{miFI;7X3vSsi<K{V?N<MMqG;O9Bnp(QKh^fVJ%HG*RY+^
zSfsBj0J|pDR8PG+>PoOnv5uY!-G!CO)u6Upc89g=4CsvCt(bf=7_xrFrR}rN0|*lX
zPW<!9dLoL5AJ)>FvUrn$j>ejIYQrM~$KjM@pat!fcbxJkx*=+ob0o+V=s`09Kz%T!
z1^O$9OZze|`Sd^iR%=(Q746|dC4qU%iU=AEV>GHqMbOzL3f!gyc;W`jWd;$ZU)lgK
zG9rM|5Om42Q7bxA=2wF!z4vJ(s<muMP{f56{HxdQKi0orFo|(%6Hp3^GL@Z$6vQh;
zZ>*S7W8reGEqC{l?IeH4VHJ#@Ipfr{Y#l*zRRk{hnmKk}h<)mxxKavvjre!OZ%Gze
zp@chNrK<D}1~%G=C+6KOMgdfWm;!XJR)R2b{Pwe_Nk~a}FfXG6=ozKnzxWE_vI3Rj
znP60#5590PuIm8u27eapUT<wjJGN?<%!2i=7;w%zL#pbZ8LXbNpnwV_<3C4y!;EV5
z2NSv5Wb|;lcpe-^n+QC!5K?A$ww&p!#y&^;x{BEg@~Q)VdJ#huz0YXyHl{$3uA?;g
zlu>b1Rto>wcyskcR90I^Uso8m2lADPSi513u9OP?Kq`IveHq+=k11}dE48uoHB$l9
z?>yiMRFWw$)BSoDwl1L}bnLAX(^)aB;DBti_TOPS59WFcPJzS))4Bq>Vb88s=I^n~
zATwzU<)#j_#wYhCff?D0;Ab!ECz=(Q#}W4Xiw&HwEDSWjgVv9rs-M!|v^K|L%Rvx%
zdR$Xrh5c8PfnSNrf|nma-!{o0<yF)HB#2Q#Wt^8o)fU%ZqpjOn?%6J7ZwIpR1@x<R
z4PpoyJ9qa?2f5(bC(_(>6v2F;-<e(|GP%JFgR9yn2xP)(Jeb^W{;Bt6HBRRwlS1m>
z29~}xzb{x*0awySo{?=hoA7+(ySB{J9<xt`QuCRLrGw2p-oP<US=ZygKdmyq5(D~R
zj^DS4za#@WD?EE)NAD-5K~6B?D!)(b^)GOjSB$BMc#e36R~k&you<HrS<b~2F~QNf
zf+1AXTzpulJ$SNmX$$FK;t0o*U%!hf*b5_^=4(+`Xu1BpSisIyssqP;?q52CY)@+n
zMWx_smF5(dG9GA`NM+O~eyT#ec4l7{0LQfiPcPNj8@qUMQJT__Z~U3R@4_ajc~SBQ
zz3l)_SX@F$=$96-0s-%!Q+vdS&!4G)V)@Y2uvVZwX4}&!aU&7@@Luf{DsG8SznIc*
zfuOzCFp;Uh({^^pDyoklV?UB>kKFC6wRXOm#&Jp*hzW0@@@=(bn<l>c%V+#w1!4O{
zaM1}@U~fTK3-r0Lhn7(YSJrpWPBfsA`R@Pk<dZ$*enh-4cT9!Po;HUcYYuF(vs$nL
z7gPQ32!Dgc+SH8QVoFO@C0+<dnMQ&G3)_h!)p{3TPSy^S;gDO}pc|7uj<Bi|zC{is
zQv>K2r7*w<{mzawc>f;}f3q{VKrn@k)IYLXHGJ;I=!AA1L$!D(R8V_d_dCz%mXk{>
zSi%1p3~6sz?GM)VmA_BRI644VW9!a#X@cVCWpQ}%`6H*S0+3clD%SKU_J%MPFL4eh
zGv3;s5W0a2Xp<+}DM_zcq0t@!(({H6TiUJ>yvUo5HsZ<1SC?j8K6{?3aAajl><Zm<
z;EzYf{-;8e!qO)Z7NdRLh#Nd404_LQ9_7?@d$GVWyO!_k2MPsS_-4~gWMndxj_phY
zqko#`J-%93RIu}n)vl+k2w@-bg>m919!)l3i2H~KTym3zaTE0T8K?i<c(HDYhkN&b
zg|TpZ6I@9w(3>>S@$BLjdK{Zdw->9yD8(KU`@c~@lDAYAV`^-DvVv&6p2je7`YX^)
zPevSE)h6OC;#aD1r7wWfx^sY^N$W?vU&D~>k9Gbp8<^9>(rkD#>rsmGoNWcrdWiRH
z$=O@$liV-G?YHaS0@lO$tA=*40^59F6Nmw{`ip^WRdD9-Y9oF@JT<Prq6D>Xme^U?
zr?edvnRp8zS}j329rqs7tE`hWAAR+);OnlepNR2-g<QnY$wpDM9H1+d@`jpL*jR?e
z5(q)v!4s4OaCLKRKbyl5<am!vz3s_V!g9(Tj%mAYq!M-Os9)OqcN@X$68qcYwba{B
z?3ogBaW1MLcc0C2((fNZA0xwacC6U|i*=)LRlzMeumx~ISgW^!db##Ks!!~Y(l@}!
z9*dOU*?%^SPdvnl|2wZ)Z!^wIlQJxRe@>h<df<xRySI|Ug+%YTwx7|cVj-CaR;x6X
zL6rp%6Tz(lYe*7Z8FM}V&6k9Qlk?pfzR}zS-LNZhizEPcEVz)2imoS1(aU@WAeer#
z2EM(qk|ir9N_9-`egtgHq~9~t_r%$d{I@iGv{--x?tTscJ3Q{eSfKGU_;cckXQSy6
zg&^k7%o3hHJBy^WPQ+^2Ini@#55~x(6hLKLCm~e9gsL9(n>6vM1HX^>FC;L5ru<-x
z&KCelz6jNTE+z}9HFA;*@4T)bi5JWCsbhAfG{EY+9REWGR+@OH18MI#F&nnI!X;Hn
z0NNRXyRswFmE0RPIjrrcFc)8-h)@!!gWV?`2)_R^;oins&-eKe@s6LqfI#a607;}Z
zTIp!S8P~=nOCq1{Uj=}<=l2@}RsxZc^`2fvnc2oC17O00rHKG;-_moFBc7=s?q5mB
zUVsh~SKMCej;Jv#t&r%v4JMSCL!pao$YV-V(hBu%>9$^Y|H%qnTRhB1#H%&!_C}<<
zt@}7GG=0~&+l}$BM?&&po&LEA*Ic#)8SmKAWbAOPDd@`RIUURk6^D`IxF;J+DI^`Z
zi4Tm}k~AT2*x2I33mNDMsl;xXFkacT1`h967XnR$nKQBCN5t<DUkJa0-S;e~IITfY
zX$kk0FKXFzk&$`$ukicNh%w@?;Mk7ru*0_Z=;YE@u~Z=hpBF4G?L1}<u+Z6D9o(Rw
z7;)My5vI&?=$fS?4d@=QTVe5M^DwE3Kpl^2<k{d%+K6WgXKxYzj`)!j@g<O!THqbj
zqa<X+PZetf7#8;$LXVrL_>*%MK{Y%Cx3+P66^WnLGAvij2&qKA#Ip7oprI)%q7HOI
z|A_cMBkl<Cdz)!8;56hX78FoL*84%?u;&b)siXz4@{!Jf)P%IWD&ngk*b}<nlAZI<
z*RYN6tAWuK-lvYZ11m2O<e6rN9ael^uJ#Ca?$%m7NdtUR#E5XC;Vk!A{!v$W2irK<
z1Xgv?sTj9Q9%vN!hKYmyPU)yI)}mtn%OrY*K}3~XS{T_2e<`VEUWM53K&JzY(rCQQ
z!HB#wCU`r)L3Vw$0)PN;PbIWn>blYZu*G->OG|53w=^Gr&IACtmBQ>>msFXwWmpeX
z)Q9JunO|WqE0CFRbZ<K3+X9FKpY7u-;)#L;H89M4GhRUkF+dWji=^UgjcmqoeT-&|
z{9DA|B7R=ZgnV80(~0C=yarO>mPa6cu`^h4x2j`}>oZ(h22q=iJNSDs{DWc5KMg<v
z#==+>89+m7E*p4R2LL5_ZNwLVb7eIY8EJ-tmbvvy<=6+axOZIkk)cY9%_}Y65|b%P
zv}RwWLs-<IUByum06rjn_Xg#M&-jxOZm_bE>@)k@^M?`WFs&(!Szq~;hXIIls+PTU
zBL(GMqT&itt4G#CWPEV9&I{#LWF6Rbq%Ur;Y*1h#gWxOVMKgB(oeb)yB_VL+bS->n
zVOXuAeA-$z(3+C5w|0w2E@6Y^bW8NMBSJ|)+pLp-&fMY$q*_7U?w;STnHaK2AGenK
zhiMFE%U2zEvQY{d7qpA`8gbw_pE<%iqC$_9<T=qX6nLAHnTou>vd-S-h24^Hhpr8Y
zbB)84>}vkMzefDaYcPupK#C*peAt>2YX18tR49uSX3wq91*X48{7eNzvY*jd`P&UB
zgLt8WB;Z`*fOO)fEc~3U{7b}tkwLuKAdt>x3=KS!bD?6I4C9GK7!9)YV85M7q*Lqb
zS%r>06zRQ#UHSf5D}PwG+>DVf#@?w&Q!p@U@%v|dy0&eyr;P4l$Pb{bJp7!G?gJ_m
zMc<J<J4gJMt>6_V!<Cq?GhyU`I!Cphq@}N@tkR|G(@5wHj=`9iK*2D!2R=Vxk+pRd
zhsmif>pqly#7_v(g87msZeEF7yGD=-U^{YWiTTtSBO3p@*0E3KS;WmuT*bS=zY=GX
zUqXeBomEmkV5B5<&a|Qw0Hqlm{XY?X#4osq8LrI=j2HU-76>$e3m1;pW5idMP50=`
zSw4v=jJFQ!>?twOevJXezYe(jgYgw7>kpXY^y|76q_PP~1Hbk_#cxYg&3AF?@Q&6k
ztLtn{QhkYdjrc9%`#O)Cu=5Qb#n-7dI&3iE)}Qx3@Ad~CYG<D;a3r1aU$@tS%<|&D
z!mmVUXHxh`IK+afbN9H*r-*N?-o>D#`-zlBtAU9xIBi)~!RePab6Bt5>=q>A|FCYS
z6U`GFet(mk;o0N-CE{yD9dV3!hJ`6`rXNPtW^m*L&v3^^ow={p#4rB|L8#fFw^`6V
zNyt665$2K?x1c&)!$E)LGZ)}rZEWS@$AiljmoTtrE~gr^@GZc24Je@pkRB{5wD^ro
z`cs(ZM6!ploODBSRIabT<#A#KeTox+T<ugne*d^WeX|DnZ)EOW#P762ZkBQuNlO+5
zfYCZ^B(<6-c~<tx_lVz_kB}Nq5UwPf=@=>Yi0~pM!2R$xMq!*Vig)V-zEa^iCK*%>
zcK<<+64VeRXjd8IBP)gPd7fvOYN8+;S3$Z@(zj$40)M9CG(thoaqpdYlxAQDyF*n?
z3wtWSWR1CPvWiM=)xtHJa9%6=!HG`!rwM32E>Gyz;Qwe!+THs6ZOGcTHgaNO!Mr~~
zeE<vloowt_LpMgd6#h%>bZaDBWB?Pl-a*4v_}x89s@bbuvO(B23ljDnP4+S4i+3)j
zSTDfH6KK`BQQM<Xs`V3Xd<{zeAl~#0X?EonV%-Zg5xbu#zXhjfFwIE|`?yI0NMW6j
zH9p|XCmXQ;8^(~PPqM)R>Rz<VhI`(cKwwLpOMhQZN8*d0EZZ-*qD%lPO)631KwIp7
zv;k5FY$Vp#-?Qx{=Voj1ptN3xOaSkgAEi?oVA>JrQJ4c>0EmL^30&M<HP?L!R&TP8
zge+{vLT!?PCmRJZaH>^kzkS63dG&$oJDPIriiBnpag6wluHsY(bTFz$NWFqR?rcVR
z9)yYu=y;s_KH@&&gOpR1kheq#SJNWBW96=<yiysYgXT*`0JC{N^A%gu%$#f}&S@>5
z(EeWy!M5}fNzOEwYf+q7TtaP=1YaVy5r2of==q{8?%*TA!Af=u2<xkLlaw$wFYrp~
z;VlfYT(~1OVySenCoTN%qz+zKb~JE>VTBcE)V6C8L`enw8u5Y&?*PMtt(j{j{#>?&
z4+wr&n&1(z+rpE5tcPKFu|sR6rV2sHHwKAZ5;Im7Ddol%*EynVC`=!nDWWzsu$Dx<
zD|tYe>nn(DX%VY7;M?=Q4=Y)fHqtxwA(f3Q>meOCSY?BLXg(EB*+(Lbre#n~O1^#1
zx6dN8Z`|phuf}kN<R~+0EbOc+uTE7U1u$`@TYsQc8zO$U&!`BM@5H@;4<-=Qj;-pf
zi{@=DLhrDj#%%p!bkbaq+ak&s*D^R2`wn<)$v|D(DEOB-!!^5IBA)Vie6=KE$lm`)
z#CJaVM%Gq<gzhXY%G43v1Z!N<nY31^XqCPAFcvAGiqwOCMHeIuajHb^$v0J_9bw$>
ztSv&~hD8!MQJ@Zd)z$<rrHPjYlJg9J5Ql$n8>S`b!GO}}r8gfu3F3+^suJ^#KfhR~
zXX-p288THY(@aSorvK0g8Q9iDtZl5Of3_!~RF+NMDDaMP;OC{(Z)1;G8m-(LYZI7}
z&73=a!BuI`N``u=5_r4p+cHduA9hxOYVBX=dw7ZX!?$rFeD;h~n)v%IQt~4>s9_pi
z#sE&_VtWMLBR8<+lS9O}tc*-`D6RXum6Z={!`<@fN`Ky)wDUb_^T;Zc_OJ8tx~>sm
zSm^Nm84b$!7D>wW{p)Y%#gp=&0DgD)z|x}9;`0r=wSoPdG4+eF*AvaC&Dwz$kPM~O
zKK%^Tq#v1H7*~@^rabU59ooj$Xt8*OAr4WEb5*G)?iSg{$%R#*bz+3ZJBE=pEN#Db
z;95e7#{vQGgFO>uT=tfxn>TS53OK)4Hc!;#nTlh|BK7t-`z=dV;Da~ILAOPA{V(L(
zTg)LS>>XK!l83i$-2tidvg1-)BF4cQ-0rhJBtVQ?PJu*GB}KCn-8>Qqr7r=Umgs+C
zz*YgZH`i&lYXBwl+*`xb#$ffl7T>9gWaxv|9%dr$<6>d#r-+?Bw>V$J8;qX0V(0Hn
ziTNoaB3{=FhAO?I@$Ze1T0#X%l*Z(jCyOktRU%YX6+f`B|3xA_Z`2c{<w*WLzofs&
zurQ;|<J#Nc0CDhhacHF5uJxZ)5%`VW+d&tP>}r7yzh95?OKxETL?5kQvN2x5n^JD~
z)}h)Rh(UYqe=uptiKo0Q?tfZtz`ON<w^lwcjFHKBp9%6sjG-9!0dO#75vW3J0sJVT
zbR*(l5fwdVDvPEy5H;*n5nnBO+;bO&9rX-?9SE9t#Cr9--7OU<cgg<i{+0gC7Phyv
zXls?V?K=7uMw>MVMn1^HF65M91b!w1D2UQ6-uudr2T?&#d|ccAWZmOZg`T;8sk=7o
z0OIJu8`k!AoPyoiWKBq(wZNPu91sOtv7>j{3|Ix`nq_-ONTu{cV{&d8L_x~C<F7j2
zT^Jsdu=6gfqi^hKyl1OgzB2^jk$cP>AxYzV9s}T|)CLLKcVbXlhwn|sKAHf2-t*S#
zz#pFp5l8D~tcl07^LeKd(~^fAlMGBEse<J_+gj9?P@QQGNIzd%nf(l0Xl)$Tm3D_k
z9s9Nu^ZC?_fsR2KYOM&Cu3Ib7gq;VhMAC)}R((%uJ+3Pt1#VfXfe277Pn{gP>}!DZ
zs@VDqd$_R~!D%gjhP>aAzk9JUc=v;eS+8p!EEeSo$szgtyb5Xro3b}Dfw~8)J#DQj
zb2dPmp-|qj*m?cGafWJ=uz@W4IVs|azqxX&4d$xmp|Fu^c^$OWPAx)FwE<tuD>S@0
zy?F^`Rn6R?A`6!N3?_U{ws16l+T|FPNfFLi(Jrf^Nn(gfz7yy1Vz74M119UYKY)X!
zp8f!mG)(q*UjrDaa6}TFm_)Ycy7a$!=u=qpUk?|d&?$A&Xmr-ifj3rGI>A=HMf@Y8
zkN7KjOocisUH=l?Hc`e48mlE{-r^Th3Mnyx!GKyt@5KXwy=A>KN2suGscdV6py%}e
zJI7)1%kuRF9y74t(z2e>+~MLig3MlRno7lXHZ1LYMbBP`*UkketRx1hqj5A{SB60-
zsUcPN(2fq-U&(@94S+3c09v;VZ)}VRD(#TH85vh3H{fqgyxb$@lMT)N3lKD4)+2YX
zX3WX<peOlC!=Eq1hIh;x@D}mUh`WfNQG`6(RQ<`Xe+MzNHJ&3_j*d5+66?|W>24);
zsx32+!tpiYIjKOo4+$#(bk-YKLx)@$inCC0=<7AI;1F?d&x1VSQpSky>rrv^o$CMq
zfB;EEK~xyx4@#1_ElJoLTb-(R*Nb1H)jOJ&L**6^Cd2yo<=jdZujRUQjOQlt|Bm>>
z9gr%BC$fXe)XQO_lDve>Z$VU(7^xCDRYwwdXTs9k^-m0PuB{kSQwQ8w>EY81=8Eg<
zd8k=y+`dBprxZj_&}iwHRmrq8S;qH-1yONiV4U|Ie^YVV0uOhkwy}s|^OtSd|H*R7
z(Hv;Samc&|4pwIp78O_3f?1d+PIs;{NvL3)T4e5-kNfMo1|Y4nSggPnB&}+Qq_NHb
z<V^O^1TA28Fh^R`Xd!m!&)?tLGo>qQ8<ZHmw#0S2HX%i(9i}!1>UY9G%O_Qd{PVRa
z0=JYb3XFL&5&FbpY<X71%>WwEf!ff-Dr-MF^BySzPzqF=26P}{`NG0>6om5uK*gE?
zGA>Npr9S+^veVmZOL3yME*=$R3My#IfEEF4gfVOY-UswMK_WIEra!VGU~Tu6Fu&#8
z^wvuhxR)_Opaa9uj%ZoLxa<D^pqoFF0+;rjv#VkE6Ul3VNsnvqrK{+z!Qo)%QDxoA
zdTXa-2}%nN@>nN(&~I=;QVM6XB`A3$L*P#}pP)ri_Nfe4NqPWRu2euvp4c+jBYqYl
z0Ii>D07~2C+f-*eW(DOnh4UjIy0_UC(lcJy03=Mcur*0;fg29?;tT@5UH?R&eYf#1
z^VNO>x|P;tvXA)1B>sPXcWJk`#c5yBL@1k<iYmQ%`NzWkE8*QwR%9sG16@a(BdVEA
zYIeAq@Z-EJDcy{(hWJp8ydrTDi=~@NH&1U8hAcTM8<>sCh|I@?1+7@K9sr~wd}Y%s
zORJD9%4l=l@((_|$DG1+-<|z;v;W#Ru(SYMqN*CqfeYm<bo|m!NHa9z&t_k-hmzlN
zW3O2sD5Mt=cUDsor8inrN2UnYv4ARqeYH~1V8<wu6tducULyANV-|KcD$a1f98Pij
z{DfzIjyO>=k`EZUw8LjLjM}_!AYw09b=2BaJhx@LFp71@t@iw-Dqz~PqanlFSBzNM
z9c^z%3hydNsrHwrfF|<aN7KOWjM2KMK3%SFUUX$aO;VV+G)89=@m~P#LZ^2_d_LM2
z4W{Z8bMx^Td=A@f)haHluIimtlW%vfd|CKFGRUhv$bRNXo+JKPCt;-}o)gbA7$Uyo
zx;LgN$bjg|GLXS4$R$p<0!e&Zs-6G5^Hanh0F92n+~QA0s+1+E@WO!(aAx!SRBoy4
z%;`O!rtbprE{LcRbhJ3O!s5iuZM;PM#<3}uc6uRY{00}jTi?Gjg%cu>2df)4mO*7G
z7-5jB)%~vAP7Ro>&{8g3zyvZp0yj2R>_5WgwPv8_1i%eErD9`6!^WD<2Od~5Sz!G0
z<lPKNUPjvOp7*LsB8|e3EI>nbcQjIBhU^V-Fn~XASp*cshk}Vjs$7qDs@v=UsX$i0
z0T|51o}dHnNrxn$lG0>YfBp>waBrb+OMgPZ#AS+j1^0HQ&7rg<2y`v8M(Sx@1T8oK
zJ9yK=hG<C9BGi_~z=G#s%q=`ekj@1TIY@!8_Ry1s3gnFmL3g<O3(r;Ri0YC!=Zi5t
z$*(j^jXk66Y6zot1X@_iCCyd2e2&S6H*65+FAlKS?*=cgO8=E@>uF=n$AFP>z89vE
z)L21ho42L6cJURJakwhQUM#10jd+jv1#|4zF{FaPo%78n#H0fkGDr9(I|xT6u(3-J
zFEc^Jlsr%3ApvkY{4==G$br#Z&xB*SB+mnxOsKm9x?QaMU;2W^`fNXasp>Ae{XQbd
zDK8L#x8GMl&^V)m#f>-MW&u00%>tka=<x$0?15cuSk5yljeHUQ3h1r4mY(XN;<03;
zivrd$;>Qx)mX<F_!+C*d$fu8ztQ6n@uRf?YvQU65D4@n>(iP;XVoN3x+?j?>RWM4v
zl=e18A<dND2gyo?#uVPa<H8DlP}-=C0h;tBR%YbTj#za=L8Zc(r;R|9A#G_OdqS56
zyaa(el>+z7Noj2ZRXq@boHrwr!V7_WH^G_XUf8hLA{#v<FHybo6=NFg*Luo*sx2&j
zzlTNHbLS5L!V`Y0Ot6RhkOn4o$I+S9Quk&#BA*r%PS07rn(jEjgtbv~<+_bCVCa6P
zZgI3{0*%;Tk6mVrCy;&o7BSEWRT+W~R<1z_cyR}$V^!He%QKazZxIpkn+^5;Y5i2k
ziNt3<B!4pnMp$Kg`b-5BCK4Ngl^3*?H8DUmLz!$r9q~2dugqh+T05E={8VR3&wptF
zq#_2@e9K!zkIH^Zpi6T7I>_>5g~5pVUysD;+wadkdq3l}TVlnpJQ-U}{)tXIUAZG#
z#ILAUB<U>C+V8ReP*kTRK(mlc&Uj}zZAw8+2KiO2?a4sDnu_a)FBp<#+eQ3DYNvA7
zk<<1T5g6h^g0&zLMnek9h|XZ5dt^|V@FT9P#?f4;K$g~0tC~=on89SdWRuMSR7S4K
zj55VoUqXfH&wU*QE_q`;kQI(~u?!P+j~zC!HARMuydo_~?RGAfzD3-F4&NEDTR~kn
zRv{QZLx7#J-0wsz#f|R~HFKdO`Fvp913s{z@ZDKKb&F(Zae!)=uZ9pE)Z`~qZqzo8
zB(wf^JibPC*fSP;z<Zpq@Wm|*%<Fi!Ct2<c29#t2quuY~{@aKLp4>K4>3<^OpLg*I
zp8Zn_u>&Ud7?P6rhW)>+=iybR4H9VCv3_&;{w3l&LX^k#G|s^aO&=g|xr>d(t#ZBp
zVgFX~F;)FH_G`sjMHZmrJL?}Rx#r&iW{qLeE^C5?eb;CK<OLJlvDujz+#AE9SQVUn
z;jy`i<$f*XQVf#xdtyZ&c|P!rjh@$|EOfSeMiM+hSeF!5)PS4$aeYNt-LR)5b!Qdu
z!6MY6hScIA28hfn6V22UwPhg>Za<<A4#-qa_C&n1aR;qU*S?+nn+;ycGh5<aa2p$w
zRAzi|lGPWS!MXB$BJj_2#MgB%mr7&r5fh$h0>?-Qbc+A@Ds7}wk3TsOL2DfIGdHp*
zf=wL#H~O#(dz_EUp4xQ^Dbb^NrjBQE-AC*rE)ic5ROcJ1!hCH-+$_yqj(D$(io}c3
zC2U2F=}ADiw_)ZB{-}i^+5q;Q_dMHgs)#hLFJWU6vx1Adm<)2T!F*$aAhp9Aey3Uv
z5qV*0uF3TWkbAItz??_iM|`ok+bowLUHKXoQyM-gdA03Fn8Nh)f*#WPR_1FPqw@yy
zmbcFr)_*lZMjsJn6_z;lpHXm9H>m2oH@$|?Y`N}*6IJletp<nuN^GC~-mf+!Z;?TY
zsvSVON1*5fyK}PSJYBzz8=lOUNN`sZm82J31keFu=qVx~B1hb{#sf^mfNpJn6ZPt^
zkS7C-^bv$BiXjugRCKsv^Q}>Xz9N0X0v4D@Yc>;bDqBv@Xj_|>ArxN}oK-A86NU;V
zj-K-YK&fU=M%x&BSK2W8A<N%fY{JI808cEyJRG962P&jKP*vjz8dLG=Y(;s2y9@&D
ztwmQy<jm>YBV>4v_&s@|w+uM!NevTyF(b@NVRr5xxqXvVTK8DYJ&aaw7ntEdCkT_E
zZZQ$=t~QKIIbM|wAS00|VDk6G?HTevF;(k}&!ZA8-x$)LOggBPX};4cdka!-Xk&ZG
zo4}+SAj(1pw{{>Uicz(penk8RJ$zno-i3z<sS>cW2HXNPwX^k4Q}~)08%eQyvgkiq
zLe(Y?Xdu}XomQA#w-6MYh~Mq;gtr4+Lz^A(4|B3`H6`GG^8P{SpBG6atE5#|4*Zym
zf)M(8W6!@3`tBqCCri_ss<_RH2ug5U&vmxeMdpl7zKGp$X7@(*4%}C3r*J`GO_ncR
zYyxUyiT`Noz<T{;-8q2#ShhYZm<dvr*6*6BAvwKtu#t=}Y~&lR_hn6?m8RvJuk*ms
z4C1B|wJM20>h36@a0_mJAg$e4^&nRS9?5NmdWGgI^3oT6;UQ@ypI!X-h@T@$s_1w2
z0KbkaxzE~g!9YjPlPWT@5#>l{<5<$wKT*CJiF$ME@L;}XfGJ380HZ<Ed(y!jP`+9E
zgS3b<U;GyyTG``Ce~Jhu0L<Lri*>5yum6PKnt}fxnBfEY$tD~9gsUpjtlEY?RjlP?
zr)sf;mFe|IQrIUL&lZ<FWZ|nH!wZS5oX*BtlN9$eeE^g7fY(g+Y?dtov7}>eS5ZQA
zR#Y4g$c2Zc=(o&zAKk+IJ13qDd|-7%Y|9yy{xOlKBhNtSQ5C;~Aa@+hf39&@VZ8A>
zMpjZ#?G1gze+U3xO`qR17glO2h(SnB>&T<UWI4AEPo@j3BECfYWj#tn{7cDhRbai@
z!!%}EbrwBT_7!1HTNLtS<7)G-8o2ci>rkOtns7>mH5@PO?}}<+zTgY#hgya-ilfc~
zun`5wfjqDVx1OwlUM$Pv<_qrp3g>+>2~|af@qn{Q`Sa0w5t}6)wSm(ZK(ci{0EqOj
zA)^axgm991Lta_>-H)u&hWtyhw_=Q&l-bv{4{bOf@3!4){2#EVv#~>qqkAP`zQf}*
z%#LWuCJLUq)vi;(0;Jr+oRmZlWn}J*isa(&__Y_~ZA;4u1o1!hK{^JicqBRB$Fi&T
zcJtj*=P+2MDKPpGH~x+;V)Xl$JnCXxR<Kk@Mr=0OB5YUym`$qU-7=6Px1A}%HB4w~
zIrIlM`9v`OoB8u9S^t(<;;)q4Dyt+316;ZCOa|5=7`dZ>rTl#WICt#gdy)iC_J<2l
zdbANlT9m*&kIC;3rh$I6fmI`@=wiB<j1N?$eghGv3`F9&7VAixp_Sc)@du?jmcp!a
zo0xzqpcQw~SWl0ls`R+|K8%3?xVK+E^G_2ppWZ}Cx8I-Hk|#=kcici})X0ezg~sTd
zKVKCSl6EHZ&=D*MARFsb&lHgrs2X5&VN>}8qozbgn{56)A-V+BT}%ubRAp}%s<4K`
zEs>}pq-?pm`P=si|GAKXs&Tl0F)9q{ONLdh7oY-Td?z^6AWebL4UgM+{;ZA|gyLao
zJ$1wrtGdk=@d}$Gkq{wHShR*I(-d5-hi8uH*=X>B6E-i{HlYB#1|kWAnonBJ(-}9M
zK|qZiOKCoyaGG-*|Ch~o%%|Y>@}LOys)6xjT|j9Pq)PoAaaTQ{qHqL1G6-lO|Ca8u
z#}M8k{t@vd;^+1AAPVDbCTkiE{N81mpXe?Y0eoY>d9_(MJ+P`WIx~GuD5p+b$UQ&k
zO#r3&!&JuF!q5~Zq?v32iDX*BhiMU5WvC|#f1N!Z!YkmJ0Cr_YLu>m~8JKD*G56VT
z>8Zf)Sq=%fM`l0FJpq{-kjjF})CS94<}iANfX}P26gt(o<Dol9=qm!lV*eYe6qSL-
zlSv50j_tVKNUmSQY65Sia@P(wD*h>ReruZ_<|V=l*<?#X^0clakHF#?q8_MQeNg+|
zV?X_8lOQY3|6pXv23wxki{6YaB8diGwX0CEe8C2k`DCQ5!O>UdTv{VeK^^sAO%YRe
z0k$A8<uYfB2cxBinGHOEz_0c!`P#tEz`|s_pHNQ+h%ZUocko)z*3?w4V^DDVH?WEu
z(wfHVA(_)xV4Nz@D(Bn-Wj|QFO%i|$PJD02y&(m?a=c2D>TVMQ9BdRsZ_aeIV=O@V
zX%Sep>koYJg;A^(X#Q*&)Z!KtgC_%_fz6$)P?-tf`pOGaCsp-R&b={Ja0Ws5Oyqq4
zN6PXX#O@=0UOsRA#>|F^7=-C23qV``KGxp%3`4qM732WbPab?@UItwf?ka<S0uu@#
z`2x}e^Y%!-cD4y`TEfvD47vF5K%g2ha#4UALW)X?MfUn9*6c#uIM{(v8R{+jNLM-C
z**Tu61Z*r17EPDZ_^Pq0+#(W0>YSSO&r?vPIVo-Oe8JDwO^}KHZo;iC*hj?gNPG4y
zoAgT=|D(U^XyVzJ|Ern9vv9t(e!VBL7%dTdi3s{>rE<Qk3;84V;S!30kTYQbSAJ>1
zf=qC2Z(LGkVXA~4uJJATE;8A`X!UcUk<KIlC4uSo8f*;ONK?kw<>P!#7Wx({ipd_`
zSF_N)W$u$9=({9$NDL_8c?Le&SP%2xO$uKmA1+uzDV0aq9+jmhj5(zd2b25CfYlUM
zaH?J-ru8H+T|x`qeZh6MeE174X`)qnQ3e<}fR*)-H)MT-@gIZ5)5VXMeEo@j)-5Us
zb+e>hF#f6ydsdX}0=ip8ipB_*oBfVP74}BnT&(I@**K#`FPi+;9*lmm5LcKp63ECP
z(3YJK8yzP>hq%PnlJf<H)kJ(nJa(o3Y71)p26Q@SALh|fyROXdPs|*;SS3niV|oiE
z>pu&4@4&$$t)ZhyOwuqiOYUDH{)&L|4F6JCd1>J4F3I`rNlkX9#j0>s6ArvwOL&qc
z=xj$O@a#8y+&dK9X)PoZu_@)>z~{Y}@50l3QVLODF;LIgSv^%$N}U9tMI%_vVH_+G
zKQUo!g1uA4Vqt*V-_{ho&Lk_yoU%pSf!yw%8R8u`t*<_tCx~PLgp~GxHDQ!9r}ALL
zT1JUhS(o6IDJZ(~Ez^<S)~Dwq;$IPeM0^8y_iV<8A=^F6mO;=JZsi%sR$}oL2E0Sj
zGOi79{{Nbp;(GOpL+}1zT7S5OdN25{%aTYMd8X-cef<+qxZp6}+e6)zGR+FxVQs*Y
zmtCxCR$xWN5<BPN&yK}EMjTNpG}Z+r5S<Yr>>~aW@pp`Gq-Eq{Id@?fvdBQLCb_??
z1V<IC&;dXeI&h_bJC;4Hlso^9+wwI&e~tLJ(wL_^qwTb!84A`{`(T|7v&T5{f5Tw)
z;w1`lLlRsXe-`Vwll=JXInEZ`w>H6`wl?Urh9Umw!*ts_Z1H5Xq(u;>_SduL4iV-p
ze8ZWSM!=`Wf2P=fZ{IjTtzS+1A|fnP##6n2fb-eo%SNmyk0F(TNJKKw(%jksn@rvu
zp#V^r;z(loCroDpi*x2P?fMK(*v*zm^<joq+mg7V;m;p$czT^W=7MXTC=QLr>g9zn
zgWaxBcW2L;>rCU)C6^%SYsE=y3IFp~Z+N2gd0oen!r{Ux4b0_8USDF~AL~I#g;6LI
zWZ*~~R~MJ6S*Zb4tlyS!Bica~6BR&$s6Hi&7p6a{fP!z>MMEcHu|C5E=Q|<}5kDDE
zA?o~0Lb&;C7K<3Z(E@dE!y&tM1>O(9jtQ{OY}gSc;yd5_o}GqTu!MJDw2H?M)mWrj
zXg<~mUUFF-y}{TzRZ^~i<3o~xiocooU$uk}%Wi<T>;rixK=2Wvt;%?for`h#?Tc4W
zUbbidKO$aHkXL-gb*+MO?^BxbjBsHpfW0xOqssz_(XzXw;xh0>?`!@rEV8Y&)~-aJ
z*3KuahNDWm+|B@o8!dg_trxGLI=(laPl$Q6*>Zt7-$3|8Z{|B2Td~Na<x3?<@Q@t_
zAh%wT>{N#8)f{k1x<D6Zs0~oI_{ES)jSOkZZa~XZ)i&I5XAL@c*oib~i`RV7_gMMW
z=(3JoIbu3_9d2cXuQ1>-k#^1+kzZ-;$mp=8^J(DydvdYSK&(oUw<`Q0;umUv@1(G^
zJs!v-uJ+%j74V7j&%jd6d;J_SqM^+PpxRoXY8Jl&oNV}_v&H5`0BMjqoay{IF!f*p
zBG32ou!et=VT=Ye`$Fg)ZP}`9H@zx8C4Q^3y-;yK0GBC|<HoS%NtlH-qxam)k$Ewh
zm*f~%`aXLWT&6LS!e0{v*Yme6PD@H$9X)04nGA5C&@r*?7n2`^u{eFQ`C;Iy#Fj@q
zsl1=qT{+n_15x>Sv>)l1|6Uj`wWz@qw!J~(a$z}JOz^_`zkL<}E({>taD^jgCtaFk
zhKoJ`={q-M9kQi^Wh3e>Pbuj6f?us(JK95BYP{I~vsqvP1t-GvLNk6)A@T}W8beAu
zHUaY_siKk-mCDi@UP8*$j8jto>zSSZKyAJZ026~?E+F07p4@dc6+~@e?e^;%BTOW_
z=uDR0BGGT~Cz%6WVsYZ13b>4tH5fkYg(+-JODn3ECae^Bkg9klg4R1k-Cq!4i-nnA
zJ8KrZ{c^n=Sy}_ONBHQ;T62nZ01V?<q(3O}K}RNpDZeVSpD(bUij@xv6O!KxVx&c6
z`6_iiG&bO-zz`aybElUuSy(Lj79sG~o;G=1Z=uMCtEB=#zEDCV9|$Zdg*VWrSfKt@
zu}@$3PP_b<>?1{W31^<{`xRM!po<DzS{lH#V0ZNYsCGEn5R5Ca`Lr&g_Bh%t49AIJ
zu)#FW=Ka!(*ReeV6V<slVaO%rsdW7tnx<zL`4;hqdCv*?zQdr*;ZqDBNIwf>XG(g>
z@)?{MirTar6L|N62x4b?z1yoad0=nN8l&}v1;I4Kp?6SC>K`lsGeK;-SciFhcKwBQ
zksYlqV`qUWkqO<LRB3RdMtO8Lham+<ssoV;7HqHJLczakjKZDU|Dq2-v_XycI6*|r
z)a{PvVl5D)QYiWA6Mv}&wlD?zb?H@Z2cYNBp764z)l@}hXR7Ezz!mS?St>Bm;I?HF
zcd8^EjjeJwCNb(r9#UpOz{s(l@OXO)M-|^y*$k_U3P4%YA>t8_*TIKB+OQ7+()6Xk
zf3e@(m=;T+BlW(8arGNZl}kI=Cwxnrz5Z!gI%;TXKA@;kLG2?^kxWocF9cn6188u=
zz^M+CD=duX1Ud{$S$EVFOX}b|d)Ck5;$@fxMPRi?Rc4^`(Rf(bfxU)QSP!;3pBiIY
zQwdbT?aAgEx)>XxM*$bL$vWW19{Y7?%Bv2-Yd#JokRqdkQGxixV?_U7FmGaHRK!i!
zPc}1ej1asUSPT62{n~-+W1FSZJ6QDF;Xk@{1|jboglGvCKM4K%h#s2kxb|Q75x?Nb
zDo|o=@4MQwHBuI7LPmUL6*fE@wa58XoaG@&;wJm69+c)a#GRe%7y4Zn8CYN>u7Mv}
zbM3;8sNSRjKp?Q7SlLoU?5PSI4PqRVD*8F%N5l&)ED}xq5%Et@V{j+!+J;=MGn%49
z*FAt8<%2Q-U~37AeDKx2w=<E~;#FGNzO|Od1_{PHQfg&kx35rjTPuEt)4C(x>Z{d{
zn7$@RwgczBV@vPZx-IE!h5{K_xuo{K2OqcGRJT?!2!v=3!WF54G82Wz%mMIA7>y|1
zT3J*^Kd#sBLUCm>yKm_lkhU&tyxbxKEpQI%An;=Cj7q*i3gs!d4v2DCenklgohk80
z8_;swMdhx&*>b@u3N+NjgH3Y(BJ4KCe-xH7o%z?+L0fWu<|Ww@NqVaZ7mPQ2V@_e`
zzkh}qK9RwG1<Zt%@e0aYi~U#V@0W%(aOFFgfhOYdvp$}JI~*W+13q1fo{|br+K4yU
z*rnF?xu*<rM{oFGg}vLWMy;d%t&<7)3NnKoi<MG{#nKAT7?L`!D`Oc>H0Cb+Peqj5
zkf5Bck{L#!y{{=;x!?m1=S<uV3}LRYzT|5FqJW-x;ExL4DnW0Nca{kVoH%9fTtrDt
z0Cz2kiVP_T?SOX5<t`YusF9DXj(@WMMCRR57}x+2-%M7XE?^kysMyI7-s}$|WMm~Q
z9fFx1H4v!}TY_nch5eePaBAwfGJu+)F2U7q%+wYLQVRSYytU;z)ggKy1;3cm=XUdx
zrI$)1z3EfU4H*YIkyKQMbd|+-P*@t#kr5GyF|T0DB*_boch3P>C=uP`T=?bE5PU=W
zBqqOK2e6h{(;0T|t+q5_>C<}sHS;EHnEeM2g&%RiXC61Q2w)@43X^rIju=(V;un^m
zqyR=FPe=N6wE~tpC{7%WSN8#O;8z6pa~;24u32!pQWcDL_-skY)h=HAONM;%EgY;1
zbKnnGit8$Al!kC_>%X-*O&9AsAF-ke>)hHy5^rBI(I{LQ5vh<e-$Dt{&e)Io)Lv<4
zt8qT1>8#b=wLlLWj8EM_7YhSNn-NrTQ)vcqL9OY5TC}gIfIjJ>t7U`7h<`==J>svX
z7EtUw*sO*js{&{V5n+AwcM<>7`h-V-ZidC_d4fV~1LT$MO91tkh%Z?tZlvx|N1PCX
z9N2Ny9R_wNJpi4t*Ntg5DnquDmBJR$OT@q(bY+0!OV;_oRyS+w34U3tGEs(6|6PG$
z4(lRRWm{77pAhh1u4`J)gVOrXs_lF0_!iwQ5&roz4WVf8#-3#@W%3Yl&r?pi1OOCJ
zMI|!U3JIFpYezQtY)?c=_di%(RjFhrJK$Su7rS`*3RjrG-o#S{YkXY;2%+-{M>XK3
z<`2Ko!6GKT@?Tc%do($V#K5CD!?({W*fO#mphbHwxaOsgg@{3&*w=_35n&I-t5~xc
zSbb!lhuA=Pi6j|u7y%lukY9XXbMOWGe8v!JTw-q=xNLb*1}pIIZ#K%KW6c@@fFq>C
zdK~wA#J{nbljXZL>Sxh`8T%In-BV&HAQgL(U}?dobstiV1g*wxRIGVx1<Wn;>MKve
zBz_WwR2X4f5_bnunFW8jkb%xh_xY3_Q8a7_u7UMmmMsrA2exB;rU=vmOX>(wh4o7<
ze9%B-QIru8r1-q&K@E8hdX_lN?JQ@jA<T<;2L)5D8;P`ZPWlh#^mpt>Wgu{I^SY(c
z?6c+Ky+(h{QZQj$Z$-{5<-%x0%FcYo%>YgYHhohovVjvWNnFhB76h%g*3`4*tj`WW
z?M#_oDp82_GMLKB_e@3|BmPLT&;nmk<3QEKF<Ka_IAN-?-CkGg``qFEE4<Q`)Bldl
zq{#-rH<sXHL}<DEwGJ~&RX~Gv{nxm7={hzXrz+xQJ&m%!^|oXXl4Po6IN5DR*8WG+
zYF=<xrL{%u+0@$Yr{J8AMwp&B#g7>E3%%`4f`JME$$P}asOHwRm8Fdt8%RPjB91oV
zA>xTla{~R9_Jq>Yx{pW|^jG5_B_OU1OiniPYVj2#>B9w-v$J|q&57J*6%0CRg?(`k
z=nccBrGd%C?@I+)GH<3~wI()V$A8DQ4n?Qnoh0U!s&#E;z~J!+s{jyG?VgrR7L(0)
z6d*rlJ2@w8sv+b}oE|~Rh&V_55%D8AKvFSOM46Fb8%uF-WZ|34urjA#u#8vB>nAhV
zg#nzcV$iS$SE#KpA<wfxgxfl)j(Em(^fnf=w<2h>{{4wM{lAehmNtA>1{~Jaxv;@2
z0?rHWrhtWNO%ZS<9;66DiS5jReh0YB*rzrRaas>zkqxT_uC#<?E=b)&A&HS1vtO|T
z09+W(xaWQH`<XWMGvZEZAUosQXQOzH_y^yWk@1ZH)H_g$Hnn6I)jm+JJE^Z$X_bs}
zXHR(@NDiOZcRmtU?r9r(Wg$huR=~GreF`QfnEY;rSr8IBB#jmDWS>wVBhLhW!qKS|
zyW!*(xQ^#U1}X})$3J@LENr0VU^pQoVNnJ%P<ND^XtwJo>uX!Q#1?NcHRX#1L8rAu
zE!_TtWcx4HM6qM+QE#e|Y7L!|^FKf@4aVhNjfdD$-plvl%&(3WogLB4u6?nmwWc!O
z#;E^yY^JfPa#!<i3koxnh#X8ecp~uD<Y*lRlqw;eH@>q(u(L9N{LUpwp+~@O$)Cd}
z;a|C{&hpyyRWfL)24IvRtM}!U!FqjwN&}uJYrnoQ1arq+=hyY~ES3M2C^n~Mn$R=8
z;?Amp83!~6#}+GZfIne0@(hB02X6g&CqfkSF}~v=9A}eJDx27HHWvn5S>arYq}Jef
zM=#HhEP|7kp~ivBs?}YkLSgL&JTb}anP~_+BXUGfx2!_7ma%TEv^YU0);tZSWji(|
z!;AIxSFo6>H94;~|Niq*+IDk*-I56TAX$?bQhuXI5MTlIE2?$0U+&nv`Q>HV^mWBr
zH<pdHgi*}}k{<eY{TC!G4>&M&o}IABpE=_?%eT*8(XSnVxdJ^Cc|b>sny+IIgca%J
z#EO@=^!YaxLvAl6kvpqlUATYmM!Lf^7B#5K&Rk(t>mC5pTJ*eMkCUEXbB_MV#DF>U
z%xtsQzEv=}o!<*jjeY>=&5r2aMttneZKOY5;rve2Flxf}m0*4XWWHo-0OEfhFceX`
zswqrFuqjDFl>8h~TzS{@l{aXI+vE(o3glp>uPjKp@a8`ZV~V8`RJ75N%2H4HrCbja
z6NS-Q$;^wrHIQj*P&EBzai#C5J;LO%lNI|j2%>{_`NGkYqZ0u>BdBA<ugHmR7r!Hb
zz2b~U8xTAnfGsz&T_f1R%0v%VX}`BdE-`1B06Y?0Ya>YtdxEWe@B<wqi{Jl<_}z4~
z6Kh%6Q&I|;f`y7qHJF|8RI<O3z1T+l1V@lJo`xG4NWU68_q&KM5yyz%lZTQ|mR+@A
z*WZcw>V0voM9Kor@|a&xDPL`(b#USbyuxXXfuw?~tQs%6dB-kZu$ffEG3OWutGjH`
zEOl5;wIp&E6r>L6H`1Roz`FvKRY3W?h(Pxyo^i)flRji3#g60@mG(IL6E^e8LKG%)
zE)6FvD7?vvYYk4R#GgtLc+M8#bp+0kEG<v?_%Ony=DLo_7RrP!a0H`?&SlXO+GI4~
z5(#$rvOTgM!L5O`sLKQZ`R&n`q{4xH;8iz9#4Hf#G2%zWH+siwK(r9R6`J9k+{47P
zPkxQ~3o|`MU7f8Uw<ow|V!()T9q2n2ENNjLe(;#CGzAujZtYQ7l|6i?0T8H^x!^zx
z?0!QES{nzv<reB?ZDH}69m3a$pICD*2#X1vf9xUYWh$_e2a2643lBuUBio$n&?l<q
z7duZ9pX}_Hn}{cR<kA#gkWUM_r~KC9i7L{^l*25of%F}_4`Z@hFm+=>qQ(6;)+ZN0
z!I}(kV_nhb_4+H^>fCKWLIXd$<;vgJ8I<m##ZlhDb&o^^$$ZM>3>NI)1TuMFGNO%1
zYhEJ0uSbc5wO|c)tK&))I+)t`_NwjKvuBI)k|WLW@I;-YOBS!@ceV^W8S&ea)z472
z|E!U5zPA?~?G-M_Qy$myq_6p&U$F&Uc8mAlEMs8Cc{p((^5$aXt>(C7zCu>01b<FK
z_NZ2BYR#`K+2RPw;p#X@d^Vqg2LIV%zc>2{ld1IFu+;mSy)vy*36p_d5tvE@uSgq!
zNNs(p1RF_vFIKe+E~8jtifX-Xvvc?`u2QF>u@ZW1_2rbp7;GF%g}Kzkoi~c&_t<HZ
z{C;iUs<d~585R*Qv`3cf-5VJ5_?00LDi)t1CP%A7W`rCqO%$E&p}ze@5qE^}Tc{ZH
zj`>^8OGj6eOYR8<3mSg}vL3+*M-pMliAE0DcKtLUeM@I_=N$*V1|F1r*JX_WX9V|C
z#NVj#*Dz{5T9Ss?oN-{<Lcvcde$`$)yWy9$2wM*1A>yZqhloH6HCp1@gJzqE-#q{q
zcAr<H*zaH-TdT^%x&}1w@R6ZsWyE{LqsdT3`WNOZGTx^J83g-R!kkLT73(<`g?0PQ
z*!P2QHHU;Byy0Xy!GgltEUEXIHrdj2t(1E%COl>Pcs3{yPA0_|R??WkGM3Z@2=_Gn
zn)Z2)p|7M*X)j~X`5i%z^9{c<%yHu8kDvAN%pUF=&%Qc>KJST&SA5LY9@Kn$xt)zx
z6F$>Y+z<e#<Ooex92>w#Bedz)Wq(CRef{TmT%n;hBz$5_YeIYrtDfhOcQ&`|4M4ml
zh^xu(b^Sn)O~luTzuM_9Y$A+|ejuqiuMwO|yCZBti2(24s55?wxU=~Hw+}bQ?R3<R
z_FUx1N&28NRjd)P^Q*yGoker^5%;XzC0j(5@@E8K3N=+&&))FKB7@PwyfejSHj&?;
zu^%G7*i%X~{J|mO8-pia>FD#Ouy^Z&&%n2^u(AaT7p4OktV!~iohV$;i_@A|{=*P(
zO;g*jMqpjUkBEN}11ItYHFFN^<-jdx4sf{yn3Hwa>1u1J?u7%Xa60eSyKvy3SLR`p
zJnYYx)J{h-2r+*hf0&oxDdKB(l*K@fDZNHZy-|9=E63ryw%^W%{T7&=dXYBtq<Evn
z!gv(`XK-^xx4pqS)R2&=z!zvA9<h0qq8mGZ9h7>Cp(Ha=@5mJvNyl-(hZmof;nWMO
zGQ`H}85?%~U)c{AT*7E+LdkZ1kLZ|5S|=EIG64O7&g*YB0;}T`$w6f_wCdNmw?{_l
z6xM9*kqG?>1vD}IWY0vEAhoY(QcX|agaMbB@w{m>rTce#F7#-vQHuj0wLrsnO`N9^
zR&8d#e_Kn4bOuZF-ije7EcqTf=$O!uI_MjY{(&<*@{Jojsan2nHc+rOj}c|Wi;a{D
zjJ712XW)nXfH~04j7$aF)LSFd#KeQe?@LGC6Nx(;Zy!eLjQqy!^ChmgrdSk9r#IMK
zry>hKCtJlcPe$FF1fe(aQe;n0;OKguEYy&)><DRnmO_|e#s7}@|3>_ei2pZkV8GzK
zF*5Dp)oTuY|B4up+SHslr2?bTEa~-^wDGF?)Y?NX<&wo;Eqp*lkIRNToC)*|QTff9
z8t*?FrEp?=tk%3XC1s!oy|BS42p@Tv!dC)y0d5&BgM3^+(Q2?RG{LZvZ`=@xI)Ys0
z{7TER=WiOLKt_VWVJ!j(6Ixc7pn9-ty!sUk^pUWzV&1lBfYoFsYI1L)o_qlyu<$eY
z>@!sY93R%rurR(e1=?3!e;5%PCQ{D}sluv14Nhs|tKW^3Uo>&u0zl3Fz8dvfL4M5P
zOG2Dx9mM2x2QcbAKP@O*CfGqA?jl|z{xFhb%K@6qq>d3USprl+u;QSI&)xA1wkOuj
zmDRW3zjB64pv;E>qKSfN5%E2_3q`LvdHf^dpAmNvKXXuqW%+5B<uE7Q%EoH9qBhhc
z)YwYjZocJM*6iv739~k`=WKLqA6k%m9AF8h<hzjePFcU9&&RTMI~jIAM!eZ<I-S-V
zhpEo1`Tn154EmMdJOYj`**aKn_a@Qt9h-8tPj*=h)4_ba!{UVPe{X;|uR^OZ0Vx=i
zvrGM2(aW=0`>+Q1JNxZ<&Avt4MZ8D+Gopom%7Ewsp6>~@+CV|DXFES|iQgiAPwZC#
zykpG@VyKGn8$16CDZ^yxw-l>2jCm39Go4r0?87YdElP6&PuhZ)2a7>cW#2w|iTFL@
z-w}Vo$SO8vOX+annoo?9Sp4!}e-`m@C2qHLBRsF^2l*fUVZ+H+1-l^kZ?iR!($I5Q
zqw-`oSm8PL$w1}FoQL>x)iACR&#==ohFo;P)&3n}qN(qIl`Y0t5Q;=ZENpOw20wFg
zB@Y)lSp=<@b(AStKy?$GiPt4L$%#|>Gry4{poybj8{I5ov^GUWYe40;0bY7=7E9uz
zin};}rsXMl(-X<(0Fz!3UXF?N>iSi~*pRaYLWC<q7O4u$_2Rd9%p<qA<1iNHG_Pa{
z3mviUe`DRp>h5{QymmJ2EaLzRC@2BM6Y+G56#N1~b4kwdm3G-XI{=+^K!_e4*FL0*
z_!{wlM*NJ;%)5QZFuIl!U-cOVx!S{8L^n(B`oZoTY-HBr`bUe*o|9ep&Hegg`fv|J
zHWT<_TWeF8?r;(r?-^FOp2(E;oZmg^^SmOD5&xadAVx!WBVQ^5ng9L@N=?fMF5PbF
z9J`<iDR9Pp)_x=+m~hlygPcf6PJC~lY~e_CR3ZAo!kEtWCE`cKzaxHkC9~5XgRS{s
z?E4%*p6yv;nF7RU8Ti%~azH|+uJVP9UKIE@IJ2+IR*Z(ArDD>UV3^TLhaC*r2}RU^
zHdQs)Dh&Ys8o<7friv`BzL#l*FBxZX579i?gftZf<~qakdYat~U%tZfRy41uaCD_6
z^_cAh))^XEkBL8w9E`$T-Nx^{w2*x^HL`G^x0%!g>P!)8WHbN5cO94QNCm3YF6#yY
z*V`Bor2CS{vX9(rw4q_6{nrvVIsl1epto0n>pvLt5k#T`Nx<nd{Y$V(&q#$@bQJ|F
zSK$mNsF4MSzM*6JX!BDhzNqJ0q;XkzpIN5<HwtGC^G8cLw&0*lBi@x6Az$sHc7%bM
zQvPZW7F?*<dZQXUKCHsw1Cy4o%jZEZ7E&Oz!BMpK?4n=nX%l$_J#zF_uNEa;;+Tcq
zC;t|A$PL={f^#`Bsc6GCiJQKyziO`qrY@p;PjOiMS>FD8YgC^sZeHz^M-sJ67wNZ%
zuQu{P<@>Ek_$D?bucV!5zS<fdon7DuMiusl%=zo!nL4BHI{?6l<sa7PuVteC2YSxl
zs2HyX=V64ZY-f}vvpGIeP~MTsE%?jZWr!|1!ZIuN-dQ_Fg_L#C%3m%$gGvIe%_o?T
z#MYjDetVTR2I~YLH=zNQB(E{UNMD*@pz|##)L34m%4FIIBGd0dtp`&A3ZFLXS-Nfm
zC4Qr{W>W3ph^t5qXyO0_NPmFyn(4H&O{mI9SSI73w@^J3@t|h$rG{QmFfJ_V&0}6*
zZ|^wn%FZ(>aE;yW&F_f+*kJb!DN5Q%xI=<-;oP1L(>zB^5kEsvq#tewF=8Pn;@*cP
z{0|7-(^pC2Ce4%7=#uvqScH^v>FIw*sP{qj<C&7mJvi>45&u`jZzyGMi`Q5qoz7;B
zyL&=iHhsoHi!f*~A;tYSoY@W!kz$By`N$w?nN+@4em&Dno8<l{<L|s=U9g0uiQ6+A
zewekK!25e#>J~cXN`BlYN3aGQY81_UNKUN6$@*h7sNzOpb@<xCDCmOwJmJir@U1Be
zJ>hp%>$(t$Dsu!iLlQ2&|H1n<RPVH$%KAoF+-o*~$-Q17;s$Da4<wWmA)!xJ?$4Ks
z<P~Q)c)01t+(-wePJM811*XnOi5Wz<+`z#eQ5yc^9bDP$wC{jSu}si9tR*!dkv!_%
z@2nT8!EPc;01s9h-+^(YYp9{4-XmVfr85<n3kSHxZ`7Q}k_G<AR=~7`A3&xDexzmn
zdt0Wf8^8p1+S*7msdp!SzhxChRI#_uUzihHzykgr@hjN1hmlhw%Xz(uH4j^PpwcEG
zib_B70cjDwvoRGV6u>*aVgP}C;Q(i3pbd2k$zkSv;rEC)Tz~Z+R9L%+Z&Y~-^QR^3
z^~IX=>m_df^U$kVLKV~wPh4^r|Bm>-B7RMLNNL362uvO*=IpFEEQx+<LJH9h|J#FG
zBux^35XtqI{6l99@72aDeu2Vj%pw>2_f8j7#qy3MOQkgkPEb?%N>XhrJlWH{brue$
z7+4(PK=fEpZYeEy2yVWDXeg7P-CEJ6wCc|lXTG&Ty%`GVf<mdokOhM!WSTEwE)u>c
zN#sOvwWV~Ti@HDo&ItbB5r1UuXGiJh08_Ie=<5ek=TRqXAdKI#W1VS;Mw?WeBIp{s
zy};bGV5cxKlIz=VcRv~jyW@WwJ6K`eRgrDvQFXUxxjz^#aNsa}@cy(8c*U*X8~<=I
z5IU=QpCkSq@oZ$%!h6s9sV8`v(j?QfF~1;)PdVs<8#^MzsBmc`_uqqtru9I=0u$Nc
zJ!?zL3ihZpe7`7yU5E)^i1O!%9|Ro{`#p_Fr)&&s>=9BUfo5Vk_Z9|Ei4WEXM7y2L
z-CP+_(OL9d{C;WA!&I@apqk$BPk91cCcHP{y3B6N(I|o6B7D;_#18MjhchdUf9zO=
z`!#|t%qltnX~CE7S$XxUq%^t+*As%b$<!_l<<ka38r=tY-75@3NUgKC4Ca*W1n!W`
zKj0)vl8p)nKA`5`WHSJ5sSNo<GxZh)#yx;M<ABw!D;TI)Rf9)oO8wDBzTfOeXMPJ$
zRI&<zsBLxx57|D3x$EH`3)vV6P_g-q#rDPLdlIjSWfN`;Vvtz$_bdre*gjWATb=;c
z^LKf1Il^&#h4fR@Rd1z?9g*pE-ALqBub%i&LCUB;s}j{^je|$r?H#*3+ANq?R7eNb
zBTs`*{C;l(85$!-M<XTjbsV)VbvDjdZKMT?ig67hC1jCiXv@b2E58PDHZ`Eep5_pc
zwinL8j?2z;gG)uD;MyB@QT{FVw@FwyQ7DZOxU)S!QjEB>>~xcrA$F#(m4A7+4hE?o
zGey`y>-a%}cxPihjzpC!HGouwCbd+Fuj!EUh`+knK!J?+iHHr)v@STPJquQXJqy~*
zGQCvw67mKPNQu`r;;#{J6a;Tqq3qU+18*dxTZ~fh|5M{v5*s+01Lv}^aX<?eLn@M<
zBOp5ML}peKB-ALf_<U!qgpweA@4&m~jp)`CYJt3|rw6z(RdhiCvawNvDFZR0r1Y~O
zi;$|h&Z?-CtVoPK8HJ@HNBJKf$+@MGQvtA9rtGrOuMxl4^CNBmSerL-x3Z{;BM=ea
z`20f1<w|x@Km%U1qcSdS_Mfjs4PYc*@8G%)P&;=fC?72sy%}UhpG3jBUoe+5KfPH_
zgG^1gG+)~j<~n;cStf<iOORc?<2zI1OUkSsiL?}={+XUZYbu{^Fo}D#1Su0g;Ua|N
zr27=!l7cvs0f^k)7{=6_8ji#}Y34`5;dfMhuJ(1a#lOLXegT$jl1^f!e*&1jg}#PA
z6oj`j>%hBZhuzj@@$Oecbuy=S?(loGNoG5$o&ohgvO4jZkfg>!{}u6j#5Y=MJKj^R
zCmFgAs3Cl$R#D?YMUT&GG7VM$NIgR>^8F5FdufBP^1r_}htrZt4yb|Qv(E&-BkjEs
zAC*!QmBixaC+m8t(J@>>MGe)&X3c_l7n*9fLLVn&B^Av*86~$Qvif?ZNv}-v6q?^B
z0aQwrvrz$^Io8sU(3mtsJM$F$!^Y-Py;=v8j&{0#l1fzo&<WPz2omXSA*%!cTuIq9
zz@V|F%F*1+J#^+`|Kz+&uD^oXteNUrnZ2%5K$E$(E!Vm+ee{z}8%m3K3jBgXa}c{<
z?r@EV+O0uwONUE`Newn+tFa!XCaVJKm{d8DtvT0NK`C!OqhM-Vv<)e^G@OIwXaYLv
z?+=F2oJ016UHBq>b@=z%<|rJjh%=kP>oTMJ^VbHPR%d(BLRx;07$Uxy4_|<wg2msP
zWp5$x-t0kSf)$wztTpLf$FhClRP?5iy30<Jo#p>uY;4Jh54mS+los{tB^>e9vuN+P
zh<`vqs4y@Yft{gHPRt?Daq-_NiKiXzVR8Mz*WZ!pyaPH~t1}hr!z1q@+%!`aUx`@K
zb<NTLc}-#I@Jf6%@XLiIBojF8DWTz2Kv3@9+xZ*lQfMeEroCvY_?5;6s5Pv^Kuc^n
zBjV5^m$(3-*^wfZ;%O@5mdbRG$1Yd`b;mW9&1HP}1c6?ePOc<?olVY^&%g#7IU>${
z4lEDWp8ReP`&x)-u5`F=`L2d~LWOlcTc_LN2#f%Q(!{W(sfRa4;Vvl9bSDIj3!A5P
zAMqUVgGQ7Gc%~w``y<1g_9oRCZMl_#XHs|x5mbAc;om!xa_1E2_lWPG)lnOJ1bJAh
zHW?XSXF;e!UxlMRgFScF(%7*;Wpe*L%@TKZvP!!<r{p(l_z*0T$icy++`0u-c2ga3
zYcN|Xge}cOJyn4TQmIRls2YB_AWBahrX%1^bjIUn6xPl>PRVh<S{=GrYY=twdnCTC
z-NI``*!<mz5fSy;hZpAkX0gFvF*2!0#Q)^sn=EG}`W>wpu){#lpJkx)`}Q2&FkdHX
zv;ZxnMa&YvabZW)531_J6*Qc&|DFqLtdXzAB@N5mDOJHpv0ZcVJ)8PK3!WlrBo4GB
z!Q62n1OM$vg?eK|%dGu(%t1J?y&_-kshwU-kD3vYOaR62{P~gSyWuxRlFP>LUp0&Z
zwb#bVvUgmF#0i-IJcp<^)LJS$w|r57U7gksM4A-{2kKVexn_y-2Fpx7vPqQAR=<=f
z^!7JlF=*7u|MLb$Tb~m3U2kHj3sZbH$r+wdIDC(|k9Z_HiHoT)jL8VZ1&3YOri;l}
z)pWFG#y=u{{SQUR$A}6sr%vUJ!@ijJNfE$65&B|K=EDkCHQw(X1iqlaMBwTeseN0i
zfE0?=XAWpX=scUp5#+AWfYsUPxh3LF$cZE>xW^5aR#smS2_xIv!S#2*j~X}Fl3CrY
zYXD*VS<P7!6W?+w<}Bs6WT2%}tb=h|qglToZfy_SUi|og_kWF;$R@6qK6S_%`YetU
zZ8oE`2P2I)#$FGc(4(7hWnFk1DzJZ;U3UnJ%k1+Qdf1*34qVPuZ#_i(J)(;E#~QaO
zwJO=dJDcSs_PtoP%PiI}OsaRbN`_j(drrj5`jHs5b?%AIR8PstLhn0|giIZ%F05bf
ze`yW;cVOR-q>P{vTg4+G-%)idU)=*QUb8iT9>WOx;t6Td)~NG-4FD$V6&DPfFzs1e
zzvD(;6aGwr_TJ{8+;LPJ)_<f1`L=EZ6Sx#%INEWN=v#%cTlBA!HT2kDEou1qH3P*(
z9k$?2C$of#uy~{*Ur^srOW*CK;&m7l>Z#e6Xwv6S;J?_Ar^OoXK?fa9zg+et^pu|$
zI(#|sSz@)LZ%(=~=e>{I<AOy}hs?z$(kjGAUe~X&U*doY<C3=gxVBqj&L#;!qgAa-
z$hU?YK0|C5pK-v0k+vh_GiGCY;3UZtWDwlMq3akJ^-4|u&J;Nxq;(ni5;jPs<mOwv
zhuljInjexMf5kJ+AndC(aK2)oBc_${{-dRKDtA3{At$c?%n40LS<Lj%9}z>u6F79v
zxBhE63T3-)<-hVkoE@}D;I0RP)+0{@tJfw}ARWHLwvSeQnG98p9OoecaD#D%I<O~u
zMAX$vO{o=+INGxnvvi6-B3>eb=NaakXhc|SEtONP(dq5qXvtj^v2_GhtUX42r=B-j
zaX;359AfWl8+oC(ZmpnOWFH8lGTIKvFIj$V-DzP4o@QWYyyXOpmU>iSe7mHu($~l@
z&v>G?xPK&ipRB$e;Tgac;MviIQ4&)F8+eE~(2KRWfSN`cHR;q8E8oe;1wS(gwg%wN
zmoU5jJJ9$$9NZz|zasv(m(cZ6`cN{`X=L|iuP~pgOSThHH2QZc4&P`h={YnfYTD)c
z`U`U%k|W61^j068vItCHRO0t^VoQAUy!&5)B?~vKGpyX}fUo@k5Io~2hQtR>c!rh-
zMoY?GQrZ9b&xK3Xp*xbycQ)bxe|84VOsHTMua0MDbVv?*$oQV1H}SRwfY9H*;XNO1
z4!|pS-VkAjtOKnquPKqC9;`5;(u;`sMFu$-jx3n%_QP<jYA9tw!GyI5s28fSQJzQZ
z<+>)`+rOfQ+kE4@o&a6F&31dGOXJfeXov)_Mtjb=-vEqQ*pI57m6gw>zfnl7d7i`}
z>%d9|i3f(%7?%`KAcv$6L06$~+1OJ#Hhj@6GYf*$qJYqVDMujIeDed3FDh_nW3~xu
zK3I<=VP|vNXPmL>^B?O*(B4LYG;Bmm7&%A$jzps7$5Q)o&5#4o!X|76Co-o3Qo>sj
z?B^xk5cM`3&&zrNYvA{U_FsQ^vZqR(_;ZS&M-uCvX%HQFXJmPL0&1HDq4L#(kq12{
z`3!TgwZ_;;7I1E4l2X35dWdwY&#)^+vIG<L$`T?XIQrTc?L;c%5d7az;g=@~`%KNu
z^kTjKJE|Agm0YkwW1zBHk*vQ+CN=UjHh5tVvh3H@1NR<Vu+!Xc@nG|#XOFwXF{d?z
zi4dzK^o*mbpkSo%JthaJz$*5vLc@7#Ei*i@koibf68anKlo02%V<BI7sQc{}Mw`~O
zM+{g-e6#V3I&GDSWfnhPTPx1Uip;Mo0A_DFX?hDi<nj!BrQ4SN>4RHPDfG5}>e8-Q
z9)aZxkhQUsR=ejYEHA5#`3)dh%aHs7F(5-BP6opUzH|U?s>@*qNLkpwml$i#QcDe>
zGX#Fk)^IArJz?dh6U0SfS<jdSaFsz61;8HMxG`SrkkxXD@=&d=rSLz8t{2Pfw`(h&
zENfBcsPspVloa}eKDjl5Z4Lfy?ddqBiF$gQ?2tN8m%I!3iV>PXIT;C$&d_z)>|GFw
zC-TCxZBnei;sWDblJ=a8ciq7Ut5x`x!q-UL{=zM2s+5YGTMAM?K!CTUm#<JAbQWyv
zO#5EZOCxJv_^{G&<z#WA1FUscAI)%(VV+REOk1;r%Cg`(NkIo(uR6K!O%JIK)gDqK
za{z?&w#01}^mm4%bw??MAco?)<YKm_iYtJKnE(<zkYla^eJ5pYdB$30Liv0K7GN*+
zFJ9?mkUwdt5Y)-<D}e7U?U$HPbV9e5k%pnOEq~L~f^^1p7p5*rUlW7QzT<{P*fl_s
z-V_We|FL5NYb<SRW9kyAs2*`cHxWX^vJISH#huxB??@~9&x*id!F|9Ny_2{|xUkFi
zk;t~+tO38UdmfgZ@YDJdD$_qH2^z6{!JAVy0Eh*DCjBc-b60V|4)~aFpXn&J2-SO{
z#}6>bhLT{hzW)h)dN9G@5juK9&{12~?fMsuU@c#^vo4Cte(`GE{FyqE7g1`HbgGmY
zK-?y|`gw^Tm>?QJQ3VFKur5pEsz7rWae&V0Y<{XTw;=&h+HNIM7i+xN?-bT8dY1%%
zj_GkV#(0DPmok0!LC3ZC3;U7I=JyQCM-W}Py7coCw84d(w=_V}lD3MDNjb!VO0mez
zD}P^DBT#7*S;o(j0B?Udp?X#~!I6Sglc+FUS=u1`s;M2nNBqjZ?)cwoegE$fk63GK
z8GUO)>2e)?^(}G>qv?0H;LwqN=m|O8&3;Nq0#SWEc2==}Gw}L|_&wtHY#nr5>JTPO
zV5BLp<#e`LA1cCbG!QCH$A*)0N@jV5v9KdPcth=e<W|+D^srVkm*5Vi!Xq^Pi*aei
zXZ)31P-RG{q6hljY-TVDr|^s40~b4-OwGT}yz&$8`N;O6OprJ4_`j#+18oHHGcr~Q
zFFx!^o~2b%deFxfPoCQETyA@^($xs`ZSk4nH_|+OfF9f9vyS+}62JZt@q;V+gaQ&w
z8Hq+cT*e)KPH;E9`<bYGqYK9Sss`f0z)q&{=|Xemft%U#BMoNQ+f0N!0^K6R+*skZ
zWBQn%`*IzA12<Ig3$EdNLlt_oEa&#&me@OS?(b~~fV|Jd9o6gKSNUl&$aS@AY3*+^
z8Nk(=G1bmBve$PUpi`3mgav>-7gpL+Ln^}C#?~l<fV2eCyHBY9!aABp07-_3nNVvz
z8g{){RJ_kp#{u^GjgI92Nfc&+t>w4P`j?g6RF)%cP_lj)N-x&-Gsmhg-0i@2$Rf6!
zh<Z6|@^<(%jpPIGQ*m0p;qg^X%y|EX+Tp#;>+P*LSpsZ6vgcP_gu%~F{Oe!=Km<iv
zMpFUoVoz0=b*oRtzq~~J$d1&P|0zt)R$xvCn=XIkyexja#Q_!snvs1z8@E4_3J%Hr
zU)X{(?YcWQEI3!wXQ4|6V>X-l_V%=17hJ*-A<lbtnzEV|2LAF62BR^XxgxQ+6~Qk&
zhw1^8Ial4k5fVBMf)v`zA^?Cuf4^`+)appv_hjBjD=0K*Gt&(mSi@(+uSj$88ncDI
zFN@-TzIp;wm}*-MR|+~(vuCzp;NoAo*SU=GK^N4{^vP)iov^9WM2C~Xg5Mb_rsDRk
zh*p}4NusMO%Qjda{9w=9%Dj}sl(?Y|uejktywx{@;6&z+h7@F5C6d}j0CEmhy2~CS
zs5LE@`fu_9SYm%E&`%s@$;KLsN~r^Ti1<blax(4tgX!TWR$);DJ~22UI9%#0yh3M+
z4$CwVgbArc#CX12TVU(fqd02DnI9s;j@tnI6JijutR2CMa5$UtrzPd6A;7O$<JBc>
z@$x60xX=@u3iz|xKp)o>!~kc|(@!tLMEATw5%!XF(gr85wL(e5_3v1e3pM%6XP#70
zKBx)_fbPkLx9dKn5e=r)en)7NlB}(<YRyL?<?m^}J7D>pxf;2~6XQ<u8GKF9kVfG^
z;xO7mo=g^S+dRppIvW_Ks+vZyO(xjC{dmWau02HFoe2qhj=+9x{EI$-IY2A?ROA)T
zWJ_t1a<Bz3<$|HU<BV!cM9XXuQu>9<QmZwgWht$2oJYU|JKMlp-xD0StVc%#&Gd5U
z@=lPhwf(P{gUwopJQ1Q!My8*vjrxV#R#eF#IB_-=zEWke#t_J9EhH+ei6GD)6(;Sf
zt!Sa`SV)Kjg1l$wjiAvX-F)IkGV6b_sUP<dBMsygCUh}0e_rdDH#UE}V=s3AKh0CS
zt{)0F;oR;KubhoWt~k;@6eaVj?0hw>UtyV*Dx=a$++N~<FIecrqLkLdT;bj_GGbMH
zO8i>`TTrkg${;fVY$8X#;H55BjGHZLgX_sOVg@=4nf=!C$Rqs;9iqGxL4O$7FF{;e
z_E|OIC!4plc$MTh6$_?%(7_%^?aGwJ&3DEPsj0N)&ZX{>N?~#SYZ%fkIf9rQNw2TS
zKiV~bDQs0s3zQW^xYQsObXY0^s943G!gpc%sT2B`iC~s~e$TLpH%O3%V01;%B<AUc
z9&d@1g5aaM>noTjxHs+U!Uv43)dmBsXlFPW{86U1;NVLpDcxJs?F8o_&4P=Z(kxgs
z;2lnLW2NkS{Px)<TMWqp)|RYvplM}`_bf_Fgy|C>)LDUaOCEE=)((7EWgS@=8w!&Y
zhE31Au%+EPix%lRTBamlb<f4?kQ~>Ic=8@wBgz$k2U6n)xTt!40FOMcS(=F>Q&jHk
zQLge$H~W#W<xH#}t%JVCyg$JHAIUV)lHQG)qw}=XU2fnQRgldWFj}L|8*u0eH!h7#
ztVa>3MAmU^f;y~9KkwG<y|@HPs1BT>oz(y?_I!}{^`n^F2Va0b2cD<UAcOYSA~7us
z1TA{1S<VifBmO-as}2X2mhlQ3AEFArXsgN+g2!wHRQf|zsVW``P;nx}{NyRHJ6u&w
z=&TKQUAX6-%>MS{3it9&5+uBmL`mS(WY2G9Gw2GIQ%x9B=jG4OXga+T|5X}o(^v-b
zx>jM%YRe}aM!~U_IdA#%i4^(I*DU~)NLim)F2$J19FqXeLWx&$8!acHwW4-sGio+j
z6-RBny;PcKYGMyeDSc&CGWlbLmSM6%E*CEQ2#)zq>|aDsaXvvdGMNpW_;z(jM&kX2
zy>zT9{$^q=RJhA);08Fe8TF`$bwPKiR^PW|7KmuUBO#^*xYssWQLV~v$uxlp7UBS2
z={O=XqsNG^jFo8DBJ3~(cd>@IorQ6k;bq+kThOUcGxc?ASyx3#NKE(wffm%Ax`^+~
zi9{1na0?4{Oa`&I{j}KcND40Crh<5pHNF5G+#{BsES^j@4yS-&Ulakuf=f@iskQ(T
zq>>+>wZZE;D`T(cRh;P{5kXDX0Vzk=S?80spvIq-_N<xPS5!8`EKtk55-8MHRs4rO
zhKeCYd+T9V#bM-Vs6v&YaJSYkFj`5YG>YIz4dHhE^1g58h-7x1Wlc>2osuOS%rjRI
zE>bnC8=zbV5NQueL=by}euc%bSI}!(1Xr@t7=OCQ1R4mYmxTVf1pom@0o;q_a*Yi&
zy(7C`?A`{2fSNbNn>L0f^<!fKf~29GIlLdp<z`~<fvQ5qNzPCZ>Rb|$urp0}W5&Fg
zr4vTsyjkr?MQpX+Dm6`6l+J1mfe$SRmuJR|96sv=o{StUsKK;$jCYorKdr|J3&H6C
z%ueG3tJ|`8t#PA^SB<+C1@mt8-UqDU?Xz**@^vCjOX^r#D^CPCbdiBn_#IK)9seZV
z+kui!hJq?^mpcP-lR@!^h~R0k$E*V_EP$0<%gNd*3eZP}MH;Y^1IiUOU<Xd~3wWSh
zgTTqoOi6rKzFZoGw9R%w$p>^MJ)8@_?>y+Iu(ze@+h{jdfu2UY&W<J=Ddtlpsf1m<
zIsGf%ccd-hw}^jbW0*QcJn;}099*DFAr_0*LEflJH&7F;IUq+msunfK!e;n&_KhV<
zVzGmtzUJ@#HR3hmCqmT7{U?gB%xmJ}HzoopFu^lms-UN-T?5z)b(bOH6$~N8zE06(
zInkfT{?9LP2Q)B6cy8|vAin}vnbjf;v~l)$(%FGN;MK2I7E2dkum`{F37i|MrYAyA
znM`uh=g;QrYG76n4Ugm={}nMr{A(SAuORIm?!JeB-{5X%iSE~g##H`YP_zFqCEm=$
z{UG#+j7}X`V-I2r>xnPvjl7zYapMR^dDWH$`!L<u>oOOu@P}>UfK&-kwjFlXN<?AT
z_0wE@|7RC>z~f)=6+IEUurFOuSYeL(oxj{77^;Df9f9RWV-_YIRpQhv&Ap`^tTjNN
zp5YFFq&|RWBs|Lh1qjpA_X;__VY1`I!-3{^YsA5ET|-a+-&nDBkElf|1IfuU6v+Hn
zJ-<-Wy!XNrDy7X-mUde|sfTB{b)=Z~GleC|m`heVJ%SZ!>R=_~65u_5_%-5xMf@Xq
z9F`xU^OC>lEqY0!p>*T+D)GT;CRel1$p2m_-QCmBaXUX1N2sNG`A*T}t33)_048HR
zZxO#E9H=NXe&DmVfR@tSd-|%V9w^)f2kB^oCKTru=eOey3tA{OJHr>EO8Cn<`Ssdl
zgf&e5iMQW_M$#fmSQau6P*pD$`7qzu@5v-BE#Uth5fQ&c{KE6Qr-=86XZoP;sb!}!
zzQ6~lxKJ5|G2`Hw;QC@sMVZsHfiVyI%0^7;3LtwhKeKRsS6gE7@f%K}vW&}Tb;$W!
z?jye8z#0+~7nxm>7$W#9t*iz|Yz(_k+=XJ$;{FSRT+LDf5`1Zytw04a=#c>Ea<ksp
z%aNLH8UQ+=ea-Ty3?+3_*w&_}r$H#VF|gJzef`WX4H5r4t3ndn_inYW-tck57M9rn
z@Eq|y;)Qc}Pq^&?^dsM!*082kC$?nW5~WR))kRHaUXlws-t@xO#aNJ3hS)&ioH>qP
zBAz(B^IMN57R!`Tw#c~OVA~WcMk{^vYx`FWny3=J1Brr<T@hy-)>@W{p!*y1n|2sg
zYxYy3Pe}Vnt`IAdd<e9>fkgWM&<wBZ7e1@GgbnBS9eqa0Z!E6AFqBc+3f1IJSE${Z
zkEG}Kf}iUN`db@L+=Ghm9R1y7pj&XE%IWu%l8)5jYAY0FIK<MjEiVFnfbT1dZ>a;C
z?M4O4Urhg?3QE1MVEO_KM48ewlod`#5}Zso_yl}^F!6)>e>R98W(0Y(r{(?m;n9#r
z2ab}JR=A!n`$X<wf97U`2Qr|OU7E|W!de0sR+8LX$1hg|h7TjTQyU)W5m41=TGI6e
z#&pOA&{@I%4bU*6sG6bNOT;U++E@=1l?F3dux8oM8vN_ruzkq(lS`f>=5S{sXb~Kr
zYH-_#pCi7pc!hBR73J594F|pb>}Znv!W{i*D_>LoO!*U0g@T5{yM@`Z7~)O#7Q~{e
zds}*oN7Eoq>))Tijx*KT13DzA5bypR5$j=i6C|gylh!<`5;Dnq|3}0_mV(?apLnVk
z!r2{tFcGX@*8qCMngeTb#(pK9DG<^P9)l)hCxS{&$wf_^%QLY2xIV%qeIS9BZbv1g
zWWhJMi41^H4MyeGla1{u_}GOEzD#29WUU1E_83#$kq~N{yWawUgUAvJp*H<WW3j9J
z3_(lDHWFhmHsE#!?q99r{r34o#D8&a1$?BYy$20+pCM?Kavl+Fop4%Tc`j4~)ANVZ
zkdp<i0rqUZ)%iC?!_Fc0cW8pe(c6=`E7qxwSThxUT0fFauZCjx#x{L4E^1?Q_L}uq
z6fg`0p|i7;L8EO3_BfM*i|u)`Nm>CuRYLM91yPyA(OpLkGTh-C?vn?SNZ&Xc$*gmF
zu#pQj7Jo?Q5a{!2U+E(Lh`8GP?<)a(GL1(D!HUBlaUDH_5evNEgcYV=A%1=|DgB0|
zzO)pzT>rQvNN1}O2x7E+IB>7|7^Z`8F*(d0WIZhL*y0GOKz+b4L_riRSVM>Xv^E}f
z2L-o!!0FF8{EIbKf3rusr&~~B!gmz&3IJG-wC`j6M8N^7?#{*?R(L`sj&n%+o|KVX
znj$-s<f3E#S7mrElL5E(B+fm)<OCs7bDYlEJ=5X4LOfildkMyz*Hgg!_csw=?dh?5
zHg(?DcS1^=NEjDHMn^Bl4#*Rr-qzX_@(8@KM{t$K7=Az?-V>q^q{+7poG<7nqMLxj
znZ<a-F&?rKlz@V`=iI9MPfXfmQKefWh^uvT-_z048X!_L@wk?eZAhP#9kiCz7udtR
zb(i%tKvnGyOQGAq)s?`fo^4yCu;+;9h&M6~?Z^aepuZUt35~wOF-n+#a<WqytYw*n
z-y_&l*uS*l5RJjwR7oePjA)xP-oIF?D~fFTqCT6jY(HT88e5dGN4rJx6)J$Z22_Ar
zo}^#raSF@&zXN=BLuFdhl*>8*yU65?p2>mhl;q-IF)UxhxKOKVxz#uP>BcT6O`mG(
z8S3za6@m7Q8vl`AfdvaF`e0)zPsNEi8T5LyN@C{vFE(j-N71LoibU|0YYe3tihyDd
z`f`hwLNjgTtBkEYL*36&>Sz?;j_=JF|Al6m&Ymbg;=c}5@}~9X>+lRzG9`w-f+X~H
zM`m27GWHGETw}r$t9|BaIzQV`vn2&I!MnX#$@9vVUdbg#mT|M@;7Sbr%6TXZ(`m*`
zmxZ5a>wqtT?<Yf3^8pYyosp3RA-~{!ZqajIIm`p3!h_q8RzKi?4>mY+hQB)o8qPr?
zZR99$(X*rfEuxC}1=&wW3N{<b>)HWLme`6S9*}ZNim$$^$<NT7)&4u;w}`J1fxA*c
zcuR8UYT|SW#aWFu)I_<k?)?2fv~eo;DM<VZ!mw~mpWVNLRQnIQHojQqF7cM?lgG6T
z_H23n0}H9z*#;rmbHw+x3T47qYUD*=wvRTzzt4saMei2Fp73V_ML^x~G2+L18bgo-
zOqTNL(k9w_MbM7x>%W57G;Jnm&JbkPp=^@qTUpzX7Xu3$Z9cy`9(=XPfH(Fyt9NFv
zG?;K_i0SelDA2i=?15t9g<JC~b;(z!8DMXPmdeUCVYJ3*7;bd?=~3AV26$`Uz=|}s
zBa7GU=zpW%LdoW6*+^l1Oc?+j)`0wlO*s+(RGxW`_?^dN-mXFt&XQlAFLlEYKCmx}
zr+>a~0pLt~k$M#ytfIh&O5C3+(d0c;wUhKx-Oy^px)|Y>e!eof)(gzhjO;7kna_fH
z<A!7jHQO_bw7Q7DepW}+x5|@OQ11@zt~TFvVE{c#ud1&r0z}O`Fx9KbUVqPi3d&7{
z@@G2I&g)qs15ac-!mp2TpuxR}IqKJLT!uQY!psNjI)#j0BmRhZ&Z>Y$!<#!BJbLAC
zX7=_KGCAMO+wktWb@&fsff^h`Mg=z7&I_;iumvh8pK&smYyg<o?~(PNUHVs>jVK~5
zjb0m)@0ocylLt0d1318EpFW$XS98!$d}&P{DJJ`6JxVml^nSA{PvFMx4c``44PM+u
zX~%1e*<S3~O?#GY4$>cq2^1W%2D6iP;f1w47{{Ce5QUZA6!g_^q{dcZ!&C-(M!d6u
zE-zV&BQU74J59H+#0p|5m74R#2V^a;*?Im2NtD=s@t>-#jsU>O_VV=<8Xh<*M|-rH
z1{sZZOmFvpu~D#%3D(qju;)E>5`b&JhiWMv5GIgtx@IR9GBzQI!30BB;Ojk1R>{Ih
zAhI|>2mU!MMz)-)Eyf_1oH|gI3~OwyWMk&#eI&b8Ise81maLnO%b6?OnEpd*t5aeF
zwY7az5r2bpKAHfwp{(<;ej19qF{w3bI7o>>=k<SC_k%VzdsKQlVHa_@B|DXJ$R)e@
zfEzzReP5B+mNt@ZSUXT}H49w7I$BCAz?|2kG&<Z`Wp?t44Igmq8;;aAF##Rx)ACZo
za9|IkuwMrd+1$y-ZeNU%8m+kgvF;?VIHCbSSN~biy*b!muXOzdJWDG;RECObgeco=
z5qL6`bBXwgg=~1BZf&qo6vj;ksZ3q^w}MUiK|J^o@w=5u@-bL)TYbH1J#zu<Tf{%J
zHH^vX8``tNh(K-1kBH}F7E!%aZ_mK_i|MW<B55fu*M9$MLK0QnBdOe5CG?q6bVkOl
zc{8=uZcEFIj-2Ij{rQo@Q<@c>IoC%{{_H<a5&ysS`d@frZ*3!24mSTuXLg|?TNHu!
z9M?CFp_&640})Ec7Y9(J>%}A>&lW*9Fem%OfK)(eO*-0A)UJulNBq&pI)b_ldrN0+
zErDNH;fxTXvI#3aj}jH?C{0u|SvU0}fRsdrJ<l$E2FIMJX{yOEeH8>C@ymj&C!C4=
zR<#bmvwr38jK8=p^SsG)StpCZDF;8;MBmbg&EF!1h$mV#@74jhU}K-GQGC|W{%qQ|
zG2)w{vD=5kR8S-PcK~`Qs)R-FMHVd;loHgMFwkT14K%DaB0a$O<nmt_Lhu1-2pf%*
zDZiK2!u4W{-&rbs;)=?(5^}V$=L35orzP^B%N9@qhV7oW>LM#@+LO8WmczEVg#ql|
zoA|D<K|JE)cUc{S&cA+2nX-8E9aoeuAO-yIXp?27zzCAi*JPGdfp6JxVV#L;u22!d
z(u(6++<#d0r{Or)<SrHDcfJWC*xL>0T)c8&GhthD!eWh~)VF($_$}h!5p~4>WH@e*
zh~g0OZT(1GNq~0N^SR-sdWefBE0x_&&TS8L#FSAFZ06PoNRiYF1NI_}6n7&JYAp>)
zO-T_`2)sstJlZp1izN;H3IUeBoCY2F-uW{xpwTmTRIMFg&@XVZX%H7{tdx>2lLAVJ
zpxS92@nAvgYWOa%<1S%bH|A(mZmC$23T}Q8KsAIwWTG?=odHtOpgZ@0vVPI1H8(%M
z4zK_l4pWALlNoM^9uv%m&cH;Ke^G}DZdDq#)_kQZF<U&$B^iZLn38gjovQSqe-a(D
zO-_Fp%N!9$thh3-G?*vbt{-XP$GEk_b0MC6V{2P$&{?cpSRcyPAc`xu#4BXhe^!zG
zj3HcMWbO?3%g-*9+!G6S!EbNDEkV=y1vEKC{GQCCD=bP6II24?s-Q4<PH@Y5{l$~)
z%wtyk>p%f*00t~5DDg+?eV9128iX@I&AH*<j9*0L&M1%I*5-e`&#KUZcOUTKua;)X
zv`U)kFIfy5jL`Zm;)QM~NsH8`+pp&dsIFZA5rVIi!`%Qq76r`PNtm0r?#<oE&?X4v
zk)yZJ(NA1cX;XAAxb*K9Qaa*6D#BM?Wdy<QxsVNDr68L)WKZARNAN4cch!y@(|7D}
zyY>NdpqQ9+A*!Ra?$ZmQ^gf~Sm2KZ!w*51!&{}tx7}CW{OX!spvGU_mS2e9!fYS2h
z789QXz|ksjJ?C+Oz$fzoTeJk?Wg1f3ja_N}`p-15Ht;uUN||@})l!j#3ao?N|4P&9
zbHpDO_qL#(&00#Il!<$c>>CkV#h}jEl?;NbZBl|K=>_2K!-RFEJ5cG!D})m@#grcr
zTXKpPQXr$`98r)rc&x!Xb|>O?l~|sxurd=UdGO3kUOpL%sb=cRum4K<`->G&I#yfk
z&JLz&kpb%PmR9(@TIq0t-hRwtUq3<(49$^Vv@J(S%{7_BuOjX{p`&CuTig7%Wuwf(
zf1R+?4FGOWV9=DA(m$59pTJ4HfQPH);SDRerRPs=^#6lrhE>Z-#7L24QA3&xR$k07
z)jqOc_6{rn#Ek-~$Raj&ivxkXx4S>DSr|#nC#-)&{UzF`v?}<N_z+1}6-gQ^Nl1}U
z5u`E^cqDwP*}2B!w{T+{{!0;PzZ|}z5kduA)(Dd~MSR6i-44)TPPgO^#i&&_E23LU
zDi=Pmv+761Cnr;JOeS(n>p%iAd+t_J@E3cicuz>o^s+R>@Svdf3a=(8-DE*rfqh?b
zF>lmelufii!B1Zk_IY6xNo^0v3X^E30k~)Vj@bRyD2Ra=dZnzH@j-=o{~q<<D}L+1
z9-oubs}C!N7epczaYsu$!>oJZzy|Z4Qk=?Bmq6s6-^(msZxqgku^KAORR(D?O6uC4
zsjTfBZOt}h0L7ZZDoh988Vl0H!M|hULzcqTw`{F~DW@q7^_F`~0mNtkK(f2AHrx^w
zUSpncVY~{FazSANWYwfUvZGK6Q*oVefol0F$y!uZU0^`D>l3oEosC@iu!ljk7~+D#
zD4=)lFs~X%P>}KnQVa=zyWwEbkQ??YsU`OYB~uo|D{wLZhpg3du=<chuz@6JGU4{f
zek#Om*BZ9O2A@nyFJIZNYQh-~b^h?r29^iF#?|_;RW4o_;FWn54bfxbpMp6}oyCSJ
zd+|GRgp$OkF|cx7=1Lh5xU~SZF<XC$_yNJ7DtnP!{{?invE3Tk#m<VOZPtNQyzj`)
z9T-q_<>>8=3@?%oZ{W}y<83FS#78QCr*#cO62p-lI+&_b20mw?zgkWbz6#Q+7Su3)
z!-2(8`;cZ31yQcYOP|OI8yj#l`=H+<zPk`fRmX}vwn4eEwW`=iar2t>o+F=W+5fL3
z5g**zXf=ez#n(oRO&05W&VuU5XBzewF?57y6uEwdwrg-kvoCnds<3nwjk){XvJr7E
zY|V9!uii>jK)&pai=Np3cQCBJ@p-L6tk9u&%5X&#qyo&5n@`b`JXA{tyEX1Yg8l<%
z<g|`~1FItUA{|8m9xCiyHS6oYvu9X#+`nYJwMipy`@c{DDMrg_4u)F7D(VXmE7{K*
zzk9Uj)2e&W=O7fx=Ow||vVgCY3`a(QZ2p4|SIeM4Rdb|E!(^(gWlH#us|Tvq769Z}
zu4@}m1y?FB(QQ|_oQV_WQnyIHqUOqLD_oV96Lfs|n7Dxn7qzi_z8Kg!S&H|R46~E&
z_8A1YwR(p7x<0tNA=y6}u+$cc)qxLI$<f)$#cPB*6`jx!gYUTifx-GiVm}g4j+}A_
zTk4CNcv@b*m7z4V86v(%JV)HyzgE;)n)SsCERGzT1{|nj!i*FC7fRan01NjNmZnXn
z5xIUQqZB4+_~g}eRZ?H->xS%C^msg#OnW>|01Rif>02^N!3)%b{o2;5@wL4tpG20H
zcr=E4u9i?#JQ1w%Pwpaqig+-Fvq19jgAVaW%0ri|1218_j>ZL6q<lIHss2o_p3>V6
z`>#nnj@XD$<wA%i{_;!2lN}WiUaH&N^IWtx0YLuaCC8(Kx$Bl42n+slkpYCcAzg!F
z@@QvsA)}f&<p<!;g@D)rC?_<@(sQPrlszK$z1=_q6x%`vL{dRc_TU3sQUBS*F&Jp7
z$bfKha3^TMmftCO%#nOZ+76^LPsJCBDp3Vj-*x)z;zhc*tdNpdg=|5ImjprfMEVQH
zb>$(6m5oX}m^vdpdP*F6djMT!J?+o*iGO4pk)(O(seP-9ro!~(<QFnZ)m3K&K#&E`
z7NV@-$ktr5Wt91rE}X2+8oMW!?Hq9*@t5o<0_hO07EizMkV+>X@(w^mZ*PxQyRthX
zC??C#ZYjK6pUm!q8pz%Ra$;vQO#~8dG}Hk888JotVh;qWI3_6uQN%e}^{l53tir-*
zx$5HjB?Eso<x6GHM%>!hZ9ZE<?V^NC^Wp!_b4og!D>nFN8lI?rFzv)d?sJEj^So3W
zFBUK8#Qp$DRV0K*)0V%QpkeV+2f&5HBh`mu$k~(ux34I075i0jufvNLOD2CzF2O1F
zj;2Ue6$N7l1Mq*oZjE3nLJY<el*_sQ4G8GdZt|K_IxW%6eEQ~6-s_qIPzt$Y^2Z(U
zqqFNgS|5CBKovvROU`{4@k9e}NeUj;q<M3oSGcz8GTyfqMkW&cG!M}_-Sa#Itw_z@
zY^jM|)|2o8qaVrHO4H+B;ARUF#KAJ{n*sFpNX!zQL5Fr&Q)tfXeqL2c9+=l5f71T{
z`})%*$+2Tu6E(~}#FUveklg3o7HN@w|3_-+zUOR`Rh2_bK49!0$WL9dfS^kWB^eR!
z4y<L%OifMAU~2X&A!VDwDwxwV;JQak97o{Wcb>1jIKXwpL&T?uPvGUr_@@>Hw(!k0
z@qvRaazn}^E+{C2M8sXh-(kuYFEQhi|3BbZDtOZjk>`Q}k%XnhRB|*ct=*0g>=R}6
zh5dKd^qIkxkM=N#y*1LM2sU`bs|}8j7*j18h1G}O6B;@r(D%GEGv-C4^Spjf33pK9
z?gtEJ2A<z(E#2XXr4-u8K>WYK{xL3vwHux6nMKbZ5hokO{lvX=7SA>)Y_%L|0K&wF
zX4W0oew4nHmS9v|s!BJPW$ih={|emqYMLR9VVLX`_XaE%HGm@}<JPDY#qB_okUF2z
z|5X`DlA9X6Fn2#ducaJ%W<}V+(1;H6VEThyl0kIKLGR=8%>r~8u&&!Qpd1-iMXIk<
zU|=!luX#(RdA2k+HU_g*z|aXhVZfIJfrlk*m1X1u6|jtI?7t)a@aH+Y4awc?xRA`A
zKko{?W3mOA<p@1F!ro*fLHj~l1accZ5-tNYe#LLx8T0dXGl|JLLt4DBhXh`1<!Y<b
z{9L~t^6(xpZe`6GlMz;<LFt9-kjE=+5Q=IGolR%@Ph!Lx3Cfk36g6>kqB-EaIS#$E
z9OcTt)p_KnU`%9yu0pvQXZA!rMG^n}(bIUsBse{h=vVGYsZaqJ`<|Gfpy6OOJXM{A
zgmTQZP0o)Tb%o!Ve^*$2HApL_P5aCEEHP~JA2kubLkkRc38{)n;_Mdw_{kD)<@W_J
zY~h0y;*<)HRtZx=q2BrpC9dd6HGZh#s9vz1Z0ytKIHS3DsGym@S}HcBu+Hqf|9q8D
z&?iHjLZ1cRaav#bO$AgfnYGqhFKe3)w9!QrSm{8O;5lN5_(CG2PPz*dZHDy%=0?s7
zH4UKQ3nLEOh<_n!xxFT{VRfrKSWK+Bse*D{YJ-xl-I?x7KdcNNUagpM`~H!I=(vRm
zO!bCun#+oRZaxty!v|~3#$d>D;$UlyQ}`C7X2DRWnz6ejAo{}N0TwD6r4KD0V?+`-
zaRv%L?rKR`BugMBRmF_lFkx5`a-SUFf#7@w<L>b!1!w4B@hH|5oWv_7c|dMy#vGeO
z#%|Y;GJw+wYun?rX83l9_#E-4jVSneB^{cni}ACLo|Luhtg^c}hC6$NS8t2lTV_zu
zTfN=KaA!}>+c8i;m86m+vS1)*?)(2nnQ;Kd38=~YPuA_7Rw8n1UvO1;V-!S>jczDe
zN5g&kPa8gRiuc^wLdM)u?EOTUSOxjcR9Hs}^0nKa{MURxY6vLkJZl&+EYGoF{Il}y
z-8lTpP_7v5#*ozQ%^La87zTFPUS&jLx9NW*8|wM00wa}Qui3xb=G`nQ=Vaz2S>;l(
zrgJ(aXJ|6ssUVRVZKP3!_2v;j7-f(;a3#q>1eTs{PlxMZGrQ6vY5~%EB~TrWk8bc`
zIi7TpLX}2aDch84IwR}Z+tR1uTZD%&o@lEVCK^3O{3GIj*pRi`{d?k3@a19fUB#G=
z8%$MHEtXL6aMi{Zs3LB(Ya1T8&L_jGM+2*>AYDFsrmjA3ZZv(hxkh1imxh(flKYpE
zbHJ0imMbv*cbfH56-Uml$XCeGW>(zpOk|m!VK3hXZ>S(ITDEyuum4;`|7t@t!eqqr
zYqq655{|Tpy`y|tf!BLFOZGVCoEzC#zreqA$X-gzi5mKxPbM{7I6!F)F94;TQMf97
zZSzk(v1VtI@XQWXyJU1X3IIybg%u<{f8}qU5Cn|L{_6-zi1Z9wliA514bU$F$XqO*
zecp>{GA@?2)@)n5S%=S-RyZKSP(fWS%`XvutzBRd@kp1iBHkPr3j0rKMJgdu#C>Bg
zOWeuMTtJ_wuoj892Y5-!COl5YA$dpdGx@DccX){Sjqt0FmM6_|0C{rv>Y}(1{*_3I
zb9#-)0kNqKNlTP3Gykx5mWC?Lf(6jlsc)_@jN04qXq|yFIm5X@HqnjJe-v=Ib1?gF
zP)h262I~Ztbn+qMHR9EVy6pMrELlPo8Zt`(<*phw;yZBTwtqX?#?*$Ho4FMnl9nG%
zn-?$jW@#&=05CwrY!vaO-n$1QA7C!dHm0USC%M4^SKPCx!Vx!8n=Pe&!!JSAeJ_JZ
z%QHkglNKM=y-*_!hxa^BK`jr`q}KrIjZp*{?qcG2-9>z6t0$c7)jDp{mw--B1$0CW
zULmN6P5vb?wNECxoNQ4tLui9p$IqL;0)O>uC-VoQXG5sjcAyjJvE?S`gjNeVnpR@L
z0!wd@bMh;`O17*n4MnKir=B9dMtqNmi06ny#4qM(RWz*eAMF}|mb6d~T-*E?9RyWH
zDp;&pFqIohV;as-5jD*{HSQz@Fu6_SF8NM%LkzYrS)u?~1b|b-zax4^R{D1JxSshK
z{7BXyUmokd;Rxv=PvD#T^(>g$CTxhk&5bK{5%Xj6JOOQEYXZb0&QO^_7V&5@oed7J
zXCX4{hgAQ_2sNN|`1TTNlt%ps8mf)g$xP*&3UMl?2sIS+&eqnjU-GFN22d|`*fRQy
z6;O5-SQivhO-ywaM1>_?Ejih3=I;~3sNVVh(s=vAh<Ule^yPWFfWo_xe|NB_8w9HK
z1{3Riwb1v-qV@#Yw1(u_t(O>D;DJhCjRCz|0G&yK0~Fw3_dnQ<<o`iTnA=gA(0r(s
z<3jHF`cWUifPde^Nd2(7oyO$s35g36EP0~UBN*{w7*6(6S)6V>0Q3;xFGTD+E<|we
zVl?!|`O8tNzhOl?w)1ElO-lxvQqana1F=d^T6=<epPkS5#QSE5qf47wBhQxu=+9Os
z+c*LNZdBGea;e3#8h>X`K3TkaYh-*e_4Q|)C*TE_$-h}N%LB)8!Zby{6!@4MdcB=Q
z(ba0+xEcshL2Fv}Ro&ivvAEnZ`|b(gnwubgu;6sNft_W}iCitGO4dVDr8;ac9GBL!
zG_f|ll@i+ofr@d<ElJKOd_LO~<@e@KHZqVa+2TFG>p~1GAvm`4#{j$R(P^APZ2_t9
zgK7Ewx)(+Kg&?a)l@Vt&pv=DC)UgL9cEXG#CwhtK!8Zz$>CI;_$$K`*QKM3UVx9hl
zrTT9Ofy67mQbYNTh4`Hg;}n5jO!*K-Sm>-|7}$o0V8F}Xk%7IDJWnjDbfWz_e;3s<
zk$;PQ&jEBJ3sJ8FK(JvK?EH);nj)Sfo=Gl0Ok%=B_$MkJlkK_q{Du7sfXNwjq@p{P
zU@+rNb?nMZ_w4or+cKD9R0f-Gde1PzS+vuY1ykawgPdmI8J8{Rky`*eQ)CeiPSZzK
zCg%6N@(~Am`QEt4r{r;VvO*>CllhV>{F_99ojspt@%=f5`4;ia_>gS*pqFlQoc;to
z+OuuF4gIUElm0I8UkfN!NBoXN?!vh}E}!RDM0`$E9KmsWV`x-HQC@WO4i&@MZXgBG
z3VEW)_a+}d<InRo0@em(202{O6s95=pfeU_j2sdCiMcdKN}(c*c7U_pW(*1AvRc09
zj5E)eaR1|@R(N5=(nc<}7}Jl4?<V$Hyu1i6*;KnK2&#y`;EyIE-UBj!>7OzV;9UM}
zI2fr9Qkh&@%i6%+fQK&ZDKCpRXI7o>$z4QYh<9%=)0D&QId}E4Yvl-74dCyFgi2Qt
zyzLtCE#lu1pCW#<V#GTpP$f$!dz`QQTSRcsS_0#D$ivfm{l;bcOlix}*+;khW8>lr
zv-w#(^kCF`Wqz+;auh!;|HzV7Ja_R8D-?&`g8m}e#lmP?6~2X1N_*Vzb+d@ABb1x$
zs<U=5Wld6Or!~Baj{C{FoleA%ckD_;U@9YSi7X{Ce87#~Bhu^*O=s(i?`^JmZQ9J1
z;;?9ejgwz6jxE4oB9)|m9v~!QodJ8p*&9~0TOuyO_#zDeDk42a+*_%*U`Zr@6`7rK
z_&Z`$f$Ml=I`9eGI&H}-7sq&jBxrEvS9(F-`Bk@$9;)MpWe^2vmjGFARjx7lX1UJV
zph#(5G-tHV8yhe}KHX8~ktFm;7Tb^+J^zP?YK8G63T7e_K1I~liM;rT&dj~To(Hsr
zqOJPPPF`{7Qm+l%a7i_J!~r#%2UzUu?0p_Fv>A1MXT-ssWdn;hd(mJtMOO_wIdRP?
z{K>#rd(M~WfTOjxNoH0j@X@cL0*1WN_3%U_9&JGLA^8YVc<L5X&!N{o3(iPN<Bt}x
zkJbyAv5Q5-{ql#~meW%vpZg7Zrt-BWFzMHML}orstDuc2mjeXf)?!p`SdA~3_u^ub
z(v0+>{wN7h;aAGN13$0Ww$5^=1qCgs*@fL6Bfb-^!+J%;zbz2>ftHagC$Gk4lzv=I
z5w=|f-lO=;IQ9p?@a#5vp841(0CX~vU4&N&cQ6qQ$ECXH=bL3xXF7cjT=337;{^rI
z?VE2h(6G!aCzlnmR1hUo@<6vPP$n(UJ<qRtj|gb&#v1doWC1eE*#_v!nT7f9h%w?<
z`|ZXhIQh|H#ut0UXF+7F69g~#r6B^UUR<oPu3(nT%X%WT#AV+ie$bgNG$J+o!AP~m
zn|0YG{&3=8&e!sVM07(*I>0{_h+o8W%~tVj<5)8L?{)z8+;BnMtvK`*{FT7n1_J6p
zclRb~3`=`^4(atH*VU56&%t?T<l(_~<Rt;oivBWHT~XKrB<I5<fz3t|lCh985=zp?
z%A}Lk8UPgBdA>X(4Mg0Y@A$U)OyLZqCJ}%{X;ReG+{<<mwTpNr>lhitbS7)l`lTv1
z6>vo1<`GZC%6S#V9CgM)-(C_hON6hfZUblb1QOdWVyj-<LnqJvT;eOSc;0Aqd*=j&
zs)nr#jrF3I2XID;VbsJ@;Bn7X@&W{(;hxjY)A=3Aal?qL3RKtwfZviQng#c!aB~OK
zP}##kODhs=lZOG>A|$7F$j<afi|CU=xekShL?sgh{FS9UtrtM)Wrhtu!4s*PlbbGP
zpQP{|a?N|11*A8hm-O#OB<<Iity2AaM2>d%3!%I)KwXiDr7TE~4^tgF$KubtVK5GE
zOzyuh$o>F_u7;tO<K1mOR(Aq&zgsHhlxR+a(Zubf6v)+7AT3GMWQD_{jpyqV5K|bU
z&Ebu;$h86TJ{iV8zeeh9dBi7too+P)JMG9+Hx5u?)9>DiB-)75l#uGqy4`<E(J3s%
z<!FX}P?`Kh4QPXc%c;J+Gi&UD9X%tn_+*(;N~4Rm3pmrky72Bn`Q7?~LbBf(21%)&
z+fRCG6B6d8uYqyspJ;zMnfG&upI*FN?_k8JNN3e##mlLS>Q)3-b~RyMC!$1cC#naB
zru(n+C&QYsxv%75d)8%I2aulq&z8M22k4k1Wc&tzmHmi*@Wz|m=BZ6<4XH1o4oSu;
zyqn}i!zIeZ|H=3!JKowYN5zr|RBQhgbn3XCeg2CJ%u#1~;Lq31lRD&Wao}Rr2mIa@
z1eO-k!%D6tMed$$=xy40ndtC+1LUY%E_Y(gM-a!-E)IQ4XDsW9l1;`_Db*wyOHA?e
z<{Q}Cr-+QFQy(MVBYsc`h=@lXx{+Ga+z7I#>RoeIPllSiWuyJ#CH+W_mvX-+Ogu-0
z|Mo_nV23hX^c!)vhI6US=msjLDF>Px+aV|xmPKl(HV|}S%s~s=_9Nn};iJK3Tcw+q
zS;{hC<sIMISzF345x*qkxUbMI9c<kzv0^Z6aU>*f1OOd3C5~5Oc!t3qS*(o$R%xxv
z6_-6Sx9=W&d|C@JbpI93><SSzf#c_e+|Zxt{UcKH2kUt6DSC*7QAk>RO3w1Omj23Y
zPew$ZkY+8CaD}A@(?Y%D$;5wUs6{s@l)<Env-kZ@TTg>&7JT$}1|$|U?D@PMtF4>L
zt?TUOIZoC(KG6l%W3!*EjeFYM0?B@I(^-iyDhW(EWaUByDNHw5lvZI&NN%+D{!|6^
z&K!_}z1TydIF)d;YEMKW1HCYd!n@+sNaJ(DQZ37zq#mdsU%7|YFzw#@aa(u5IY>pc
z$3#Ixe^6O!w@rX%%s{UrE)jpi#7hEmW&ztI1<R=P%w_7krMr`kyx^Rai4BM!e2xgZ
zZ|A?*^9SmLt_{tkXE--#n=OePPn-AadHCg0H`oxXYb%0Q!172aoQaOdmWx*l$`3I8
z1gkD#+2Z<ho9FhOdh1+S)NvhX36Hh+8;zV7j0n?i`*$jU-Y6`|h&wXKvo)|+^w<<|
zpc}nRIobLi3Z{U19xz))f>>ujJ=QW9Qhl)@?`j09)-?om2V|Bqy^^fXEYhD$2QEMA
z1b&S8uZSIO;CJAGAoP@+qLf8agF%bOlyp^TnR4QB9m5N(^4^?YZB?Nj$MnLEZcvD4
z5Oj;Rm59D;_M+qbb_tH%o6{Ze<OlO8t)cN0y(ojp5btT~Mri`Y5k`5t51_(459oKU
zCI(e1rN-*a*H5i#65x(jNopyB?YO`;aY0?g*N8{TU>Dqd$whTGX?#@#4pVYIpqsqn
z2C|4xWKpUnEh6Y>@|d~RC_^+T<v>G0_;p<eEr>gp<@kjOTUGzVMhRYf*uX?S0NkWx
z9Z|Q`hE;EGu4%jZL;z6?Xe>C-KfIC!-|k;VH{ifJjQ$SW7);H+@hX8Y1HoWG2%!qv
znI~(c1ae`}U?#9H{Cxg4X-O2%ZvkW?g+AjqM-KW0k3CsUsoBh6GOR+_yR@Na9Yy>{
zh`7b8H~-SI=R1=EsQlYg=(;(u%pT$+yd$qX0xueVd$$h2;*4a@d+T3PHFw05rz#|s
zF4QtpaGyDzY~oYC(@{1h->=Z8cCLGTxU_xS8dh1cDzzo3Ajsy<%R1c#2S2j&QWRzE
zSz*}ZxMo3=!3?DGd%W*EaxO)`jU*<+&P8wCZ*`+In*}_#b2WR^HdUavL*-J4+pAhW
zP51{5xMGV7TeEEdswAq!5o(9!u+$bYtpnhd$kNjb8#vjPCi5L@ly<`VONu~o0u%p9
zkNp7!bd2~!$--US9FPy(%x;rI&)JrYNZnBxtCl*sa&3S;Da=F!-7otEWVah_q}mfD
zX-&j0fG}a7%syhb2B8z^TvflG_p7Px2`9UAAExwdXJecun|@xfKs(Ah8+lA_0YtW1
z5L<Qq7n0RfLNHhWt;`>uxr&U9JZ}b&n$xSQ-yN)I#ox%cZSYqubsMpqB|%S3HMNym
zZW*9lkK+5Z9B8Pn$O~^yDTFy?{1+qDcb5EyNm%9sun=MUC0q?=%?R$_5zi417Pr+g
zvH-CI`W3NQ?<~j8$Y6=m78HuY)dW!TrqKZwX)d6>Zyx%|&V2`HJ6coI9n1D&;{|W8
zf<?MFo+PjzDPppZ4qwOK1fpjNGbh_%$?7B7Kc!YJS5W{d8*@h;L2i*j>E^}X^?a7<
zR=welw3Ap$19|zz((fY%f3o2$8>^s$AO(t>#vQ=2T|d)0z*;YKVeY`-XRb&+ekB&Y
zz)*92^ozCe?`@s6I_eSJ8U>MCdy3p`VRiMvsM>8Mpz{1X?r&OB%1RtIDzImUR6JmY
z3gtPFbu=`XzFY`1D<&%wzSlOZAl4K}yL{%3r~17;rC90MXealkLfH`?l!8kAb>%|Z
z_~4bXwi$1$!Ew2voRS8mEO}>!d_5SJ;Y@5zd*Lhu=#o{G3{yIhigq>$nGT%6?e(85
zdIy+BMa-|bu8Kr+LBTnDUgER_00!&bklcC^19~368xaSR*v<}p%@#D~%+l-6SSnpY
zj?*j|1=<sy78yvz&iDLIjSD4H9(NI+xt6quJ#rWus{ytAO7?dT$k>j-l4|Yncmo$W
zyN*{(;CqsRE9>4F!&5+b2w3HmM^X_)uv>d3f?qiE+JY;-b+?&8&SH-zcIb_KQ=CO-
zpdnAzPmO@XlA*fdj4OWL*`h2CpO*;c|NX&t)|g<)8g0mk<ptDBS|B7+8+HZ>vcV!1
zh%E+)oOg<8@@P+lhZKP4#_ffPmNWcxZ%t6IrnFnS@y-;?^Dm^^dIQjlm0g@6>6ly6
z!-}z>E)b}o0pHdRphmDVlKf3e)Sv?YwupG*ekx;lG{@n=nxdw43a=Kb>r&{Y+S7zC
zWU%aP;Q(aT7bfadeAWngGrQ4Gn|<J6H303W_!HGp%a0nFMq+4go_wSfrtJ1)t%{jn
z{}IOU%``)b2RW{HqkQLESeV}~tS@K>FgOq!7x%Bke*S4=bkIAWD3R+a8A&#Artbjm
z%CM#6n`y`0NBO(#N5^=+oG&C78C=w43C(T)Ysx+E5&w?pA|7pC!epG=Y>R0C_pCEx
z3m?gb_P_usbvAZEAXeJ(!Bi-Brb_G$&HlXe9?Ms;Sjv?Ln=Db%gPl^Sa<Mtjta`U<
za*h|#pp<RnugIO>Njy|?&1@QiLgFHTT`Wy(xz4N2^&YK*<ld7&0l<st({pp3J%*?r
zy)*z%_`lpb(gvy`N2>=E{BbH{9XX@}$EgDn@34Y~lYUJyh>p&^Bg=LnR*Uh^!O(jT
zz_&ht%!B|t%6BatzN7U3`Uy*|FK5vY7}mrP#$fkS5%FZstulUYG;w}u*}yge)k!L>
zf_gPMjYf-H5-EFTPV-ELoLkGlJ@-7n=Lv2$^Xu<%QM00;!o)f}VMDF*ZtcM3--SbU
zFFo2;kpkMU#rh2(z&q+_Z)gs*Z98M67Ahza`0o*au++-dGnmwOOQO=1ZxF1EP?QH7
zG4W*?CsQGVva<o>zO#^i1y=7&rgUQwWjtTNNRlwAWPh?$=)(-=!c%AFMV@_dFxamG
zhMYDlQ4^;6z)5^G)TgLeC9{mRe;xH5ZB=7=$OSe3X{{j*PRqbq%z-%X5zidW13^mc
z;$r{8GKd_!E-vqdoHRJd3lsDrrc79yq#OCu4p^+OzlC-Yc~;oz(zgxf$$wtFx)sAF
za1^k>14`(Qm0k$(MxMWR0^*4_>F|g-(Dkq>L5hfL#P^67MpX^As3lh1nlx)8gUF0R
znj-!c@n^*E_@(=ZV2#?%08$c$C#qbRh;KOWM?BAy*T71BMu3+}a};59)Rq!)ngR(m
zsor?zN-j(JzhqQJ0s>}7aYSzA!CI9$F)|ySz_JqAuqvnzTw{TsO#@Jvi@swTf;thU
z8@q4@Y8wvm^VM0QV6tBiHh)&ms9oET>aZu9d#CRi8FM3`6>INZE+0xJ`Mh|~0*>l#
zbC{Zn3o}3Z16jLrw_&-3=__CM>r1DglAqG~aeV+J?whabY{_0_*jXjU1qYuqvSXI-
zUl|zgjLMM}iFNeTr<9N{?<Uv~rf$f~f1uP!H*YRe{+IPoYYUzup5fIRn|%0l|Mb6u
zWoixm)Gi#aBR;S9FTndR=or+;k(;QhG)2@R37I$i-x0qqAKfw8<LGocQW>mLi#@Sw
z#L?wO2xKhKw*4smxhHf=?KJSeC&ELP+<$4EI;9cD1t#1Z9X#28Kd--r;T^48tj6cP
zB5-TDi)|U|06lxOCXyECQ&@7b@tQyHz^;6c_{#oWXlKlgzZ?CVcZe5NAFQoVATDwP
zeJy056IGAQnkpoN>|t&O%g%njvm@IqSpCw*Xe$6rMogs}Sg{i-ze+`-0UcK;jI;zU
z3L?mDmGFcqi?hwgA^8UEIh=2pzOv&1>||hA004jhNkl<ZK5=0^L8kI%I-$yL^<dsh
zx6o|{fR=li!HPP!YIB;o8FP9I70g4<HLJO!15-XR_i#$S{1ov68=RN-9YgCQ4rHX$
zI(q6;RL4A$j@Mka3jf{u0A(iQR1KiEh+j~?`uY)vD`n*j_Wp`^kNBPbtCBhCZ#a+*
z2}MqJHWEN~bdHH5$#DrA_E%wEa{)%Yz3V1N#2%9)aM@FCVc@$p7fi^UCLqm5205Sg
zXH2-{&KtH+hkhdfOtg!S#+E!&0sn^$3(|>Nod2+-q6fiCyrd4CPmqmi0Z}H{;V?3C
zvNsdu1zkQH8H9Mk4B>yrJ4LGI=D!vI;t=tR#kY=<V9Dw#eJ$1!q98tgLNGNFz*>eG
zhC!PBngkFxTiEoyN0Yvn)+u^P0F=1#F5)}yuPxuqmd%q3x3SoX9FHxc^Fq|!(?|b}
zkzluG#0k>t6$diXLiE|he64lsrPiCX-O_<KrVVYOAYPEGZ)EWW==h9F3FbKLEpR{6
zil3^GRNBf60p!FsNoy)=3rKIFWVyXvcyF}LysoJT+}D5hf$I0pi6M<i!d{bSI!u;V
zRRE9CX5J{L(q(UOkP_hSX?qiwb74{6thTU0L0r)q*609=rHs=5c<~XU9a`#$1s37D
zrVO!33Lr9go-3`tvzp);KsFjja{VZaFK83#?Af9Nhw067jk~0dnZphJ6ZPg`cr)-G
zXKK}H5jH1%t2uK&=!6~<!t$eDBmv)vWJgF!B`vXtX)k3J@j2q}q#W;f><0wkgSG1R
z>m+)zyr46shDhbpvWa1vL)Va<kz@ZY;wyCGCrf;u5(hd5u&;oo-bTremP$|jWhx|d
z3af~Ci})jAjQEYXTLOkkzVE~JS3R^QvpUhEr)t|F0RXyrktln7$$ZqmkQNmTYR%T)
ze`g-QC+<C>&41yHFDS$`07&qY;j~}i0!osKK566%Zu&w2VZ_lZ>Wmmhv5rFmm2(|Z
z&CWGGsArQTiQ5LDfuTxE^GyCT;5yoml%z^xDhPb-YR}JAJ6_bWw5@wELbp_phRF#O
zM$!Y&+QW`3f{FH`q@>Yp$~d)iSJPanCMtR0OeTIyD2xn>@H1;WZ}l9*<#*>czaN`}
zkit03j*A}w{*sYO=`<uDO62y%#?B1J=#>VJ^7T02f$GJDFP|6eFPkizGvFrQ^!{3*
zhOerP$Qaqq2g~JeFC~FBp=U>$)|xu5w6u0i7EjFznbG*UMQ@1_w=>ur{Zml2o^ZO2
zX+$o>_C*F(M%+hyp;<!SKcf*#q4Rlj7-3IvD~tki-Ryw4W4iz&0F`5~CyI#~41l!y
zRniD4M&Ux5p&>R`Jh7(H)Dizmo(Uf8P)o8^<8JTpLYbu{A11*Anf4Bh-ZML=Gp)h~
zi<*JndTZgFt-U}P+i-)s#D7g3_|i(w?<N8nBfgT5j{hkDgr%U;kL)dE%^Ub?g3+HZ
z@ib@CAmug&uH+0fn`WSEt(V~|_+05C#IvWb34jtr2?5g5#ycH2*h7NACerq!v3HUj
z3pKXb=7NK(O(iNHqK|*K#{%acUA%g+?pWSxSs7DjMQxI!g#oF5=VFy%rNA9|{h#>u
z#r4nBYlj;i$X0i#3Z-gIIbdDCbc-Fw&F@)-opA|hf4+HG_Rq=mGd`um$-MLFi=%f!
zXLTR(izyKHxUK?$%6XH58<uTf6W*(Z)KHORWE4v3KvairxWe4%RGpE#<bg&R|K*&8
z-X8lV@^k*1=l?KGHsjkmPSJ>6s>D95i|D?JR}4;yoFLrq&_=$m#mqvzYGIooFLD5l
zKytqbir$()Hu%fJhTv-1)IX|_qViUjy!pWFmU}$uoQIxmEo&b5u&pK$Ni!E(Cfkkx
zy1i}A!qTfos~f!Si0rX3Y}&8CSwO2nBv{bFafdqZPgXa*_<=2{_y&jG1&_k2C?>8{
zKr-WbuV?!{TJ39xdDVu4Z+6%4yv`xwhoQa`hV}@zEM;rDffXE90a88DbSV&9ZVuzu
z#h)U+S>diV6~ja*EIGibj7%*5nb09E#AL8Yu41u=IUoFPXQ1Fg=4X4p%ZC+6Q5Ely
z?!kEs7~X~)KdmEW7KYCB9-L6H-<wK2gTh$os9W6PU;<bLotz|UW(uA*Gngy9X>E`@
zGaMLvO=jVJu>>>AFA;ubUb!&dvoNf9`~D2eY3wnIg3nhB>^b5u>~gVwC8)ooP%}H7
zH#?-IaTFT}{xRYg1c;M~rgPxiU^ly1e_hPTMjW9t#B07`e0Q4<B<Y9}N8a!k2PS6~
z8=K3uuwSvbFEG(UN$)lpv-;?XaD@@0?^yd}&oj7yUdl}boiWMmXeO9Z`Y1>rDt0lY
zpycJ$b9zC%*ppTsaR;9wJ|!%Q#8(1PL0S{!eh2zE;v?@bL&`02-$y(~yhr>Kgit5O
zpYf3cn#rU6Ur{kC5>=Z@aG6wRCDj?L-jVw%kGlEKMF41NBs#G^y&d4f4%4EJru;=M
zC^K7fHXUls)@=k3?V+$oNzVO=u&mf52~lc;s#^$wH?X<|72wIZ<r@f`%!XUWRgKIu
zS0r#<$A%z&fN*+%NtR3bL>L-B($NRDA}=){ESsa}U<%J_?LA%CP{5&IBmNCBsqqa@
z5&u{RfGcI4+E9NN@r|T*&WY~nicJGRMqZ-aPXWY}`ezn$qZ7J^LivSDz49EDJmM1(
zBEuF2-#;bads;`8HOzK;AjP|IS~k{Soa4af=GMY9*^KO6a`Oec&;8N?^!6waEktfW
zE#1XANBv@@%#8BiSAzMcwF+oL_j}^@8hT&en5w8^+>f;mK|!~-Z1zgZm@&XCr$Up;
zcyl1io6X=a`Tmy7y27h}*awhX%Z#hAatrAKV3Xhc#RSwEERgYkrP7zTo&5ci`Li?m
zdPe}5En;sp_cjI8p#mC!>9aroHKONH;Qz3xMGMc>l3Em2ioRmJ2kP)c#Gh*nIe*y2
z5MK=>K69K@p;;6F7b>8^o>iZ-Z;B%-1Dw_ypw_<wTi^3oihC&8oY<xAnA=NLOC1!6
zWqYH9N0V`G$gt+MmUMDnmBP}dy_L8Y6i$f*#QX3y9i(Ee(bdMOrbVzG?5!^xfbs{#
z3-4AXORrz`iDsF?yV0wRlEwgU87R?-JVRmCSeSC$F}eT7+>Ns6Go+9RCiPRMVN8E<
zhIf9%0gqhT*N8vCA#ZCD!3zN>Hwvyb>=W47bMj<dfu$p|oiiw<w;6a7tnTkg0CX#S
z$5-96T07H(@7T+Y3_v=D63$Rk`XB?Xm^8ED{d;?4$km#6I+P!!6`3|(QoVmCQ~N~U
zcOLO8RjnCZFUC1#5L0&$zI$tCzJNi(Mx>@H7S#YxhP8IA=-C>8`;S^lR7bwpAZt-;
z1@Nw3qpt~HE!o?^1ka3;L}|ke&P1if&a2Gbah$H;sU6op@zLda|NL4cIoaWJo*-WY
ztQR$~46OPHe>j=h9<3755@!||YBc}RnfxO#xGTT9*l~*R0FWa*RzX4Q{>C0YMI89Q
zi#n82T~+u`5r0KY5uc6s&doh99Eg<4BlcMV;zgTC&eCs2;0ZH!k-=!aOeLQ+7~6Q;
zfcd2Z(6IppM`w<&_m-e8HgFEScU-x0$(-|~9oy!h)dEt(2M$*GzuG*rD{HwpfW1+#
z1qkvVT+R<>L`TFQ5np}(6aez&6q8wTs1sXax*sB-!U|XN$^)>tGBe&NAoM&fyTEVD
zE?bl2NuQ7|{*L&Rkz))%iYwIOCz}|%0lz!0R$7kQ`jKDB@3PHCv@1&g+`4BQE=oFv
zfRyNx^Ya$*BjOnGBjOvH5h==~?Nl8y4Zt<ZrHNzP!;7>~b3GxsNuZw?&KkklE7`;y
z0;P;bgUosaf4+ZMdb|POQd=>xLD2r)aAvxTlf`YR=)!8R9nfJzNT@POhSR)|tQ<@v
zJ|+97^?#;sd~XG`<MLr3v9>=Q!OWf~pnBxI&i9;x25H-(lYFM?9wY9E_ywd`P~V<7
zh~pZ7%$e#FWnPgTrIq9=?rZVpg&7o9CzczT|Jz$emTJf*8xS#(WWOP$Pt<FprNE2b
z3v$i^HLY0rAZiU5({>GDVlf+w_Y=pXApEOMC(}FC#wQiPTu*q7_{@1KvCxeGGy-{?
zt=gH>>Nrp3<^w%4;?@N;((s+N)e)u3YS%YE5@=`A`an*UbLe{3_%FIjQz3|C&KKOM
zveT<Q9-{gvg>NV~gdvR*&g2sT<YbBXZU1Y$oQjAgT-KRS&ny_EDyBJrJx6r-Cw2F!
zH}rtgYJ-RyO*A|+9|i0G7V#JC`<=;97Xf%CtR7e-r`t0NiQQ%frSfY{l}0>I4x?Iw
zEVmty(3iBn1AOuu)*&t{OFm@)7pv9kFTj4eWF#h=Wt5hIMqEWrl#mLnB%8@ULeQDf
zdl^)~1yWaywc{E+ih_1}M}mS0U(mzlz9)SE6T<&D&|gLwLl${8*SoR)lL4g}pVc-D
zAeO@f+cgC!d^|<Gkh?!37`rlCpt9zWlmTb}pbnP{YSQzUOJRPqztY#>1rDi2?XpK1
z+*@#Mc<{_3g=`GqoG859L&<6z0v}TN28>+p9oO>AuQMC9CEr+lv<K%Uln7k1WIOub
zRK_|IV_UZ2g^LPm;ztfZ`u!tSr#m|t8MV`%Tbqx;{icpLOhc)IQX7KoNJYA}3|Hlb
zbHu+Qeut{+_*|tWH4p7bZz1Cg->HBeu&sh>d~LyWI|UwW9@zx#6&S5cY8M|-;39MS
z8AlrNYGd_<O0H`bL4?V3#KaSXa!W3I0?>K01|gEI<A`_w?C765(C)JF{><#OMlkn6
z7B32BT))4ddo&0Bihmq=o1~8Ck0_LppudB>%1w<R)7-G1c_*$o{Q(W}$gzEc`8iTV
zZ`T3*h=1B!xRP=$h|`}ppNG+7IZqK2orj7zvD5+NYXER>gF6cx&Yr`gX)3Qg65@9M
zN>=x<=d<UyvuQ48f=R#G#}JmIT-b%iYCv}Yn{hqRS4MV*$jZqeD^rxU7-g0KXx&1h
zp`Q5nk*_WI#FYwYqDA%x%9VKy_T*wGQ${z0mGl5#cv{$**fVF2Z<uk$#xAP(83(nr
z@H?SFEloc&B)~Os00YuZC8bxAx)&QRn1Xm2P|f7%k*5+KSiO!cR)XHN8L$IB)tvQ#
zt2klz6*T4sf}Yr7U8F`ajG(59D__)cR4<#BSx*5DDCuyW{lRf_o_i06==z@mAF#)t
zXbU!KFeL_9QAEDrbT3wx&)4u#`i0KUkgLh3WiU=Pm_5E`z!@nEpUYS;5oh#E_v<>O
zR!+ce$ED62@f(1)qrxz{1C);b-au?gOYj??uBt|=ge)2M8CGKJ;o!eVWYEKJAI$>l
zahIyN)!c$~5Do2U?fN5f;IEn>f+f`Kg>qTpS_F`suTq<hxatCIu`{ofrVsj_9iDK4
zxvBYD;jC^`0y7y#?*Oeer%;|sGRRDCK1BSAdE`Vd!N}2iPedW6uYz%rwSn~$j}{!s
zW~PWAv?7f9{u;^2_lSMOuh@Ua#a}G<eYZ$>+x*vvKOs_&w#c)en3d>Hfaxt6sHlHc
zDsobA<xe{{Hr>M!&ZH!fo!P63<lqI3M@BDEpPb>r+;VSHc*)pKm|nY?!Yhl?qCJCj
zDpUS^S~ntP@DC+NTuLh4?GE^*I9&KQ_m(5|HoIZ~Z}l7?wIOa8&>fQ+^eI8%Q{~6&
zW-CQOz3Uk9t6?lvjdmPbPm>o0^cL{~>Z!TU9!9TW315>6uCJm3g<OY<%Q@CIVAu;C
zd`^RWAa{JU%yr%u^`(7U%lLYG3ROn#kfY*=h#z$5x_jHOpVGSKRc^X7Gw*Q;*?J2a
zZdIM2C5T5wP-%odEMq;G_^yDRsmTdd2i|a?7ZV?KBqR+v)w4agJuTxX-)u|+Dzlx_
z@^`8Su@2yaJ<vL1h0fM?nc-5${{SFzE9lN2f3QfLp(T4srg2QL4$l$KR8L-!{WP|t
z1r?g7gu&R^!xYtZpqy0Q=QUcKl)}m^4?0+tW(R|Efda1h-^CWzBrD(8;d?@K&zZ~l
zh!j9eL~*uZyP0(zT!}tC7;uXMx)87XWywjWqGD~vP47Ee<}4w>NdBvJ&elax343;g
z{~)C7SkIS;KaGwW*Wcfgr=39EZ}_c@t=n-e$F=_HVlW}Yd(6Q93F=pIYr(-yfVz?@
zgKBI8vVqJBwXw;c$f4xa(FUWdRotx6nJy6Qz(B();ui?i+)(F^IcX;V*@A-WP`@56
zM2I$1PSR~wuzeLXQV|UiIhts4zUBv_dg?$`a-ApYl&njMC`l55T@rxhG#+S2svzbE
zh^V=m*e0VC{rg3aR3@ty4jTb?_v`1nIe?j9(e4q3s^XLr1JXvyVd<8Yrb@UjG30Jj
z2sAJFvzhS3=BtgiWKz<<BEI`BW@arVXGUV^BQA9^uQ+U?x6ItbIpWWVoM{Rp8`N*2
zx6CHR24xtr1``QE;Ht|tfVmQb4uGMSbNv~=TVQ6Xj^3*e7ml#HNhTX(xGlkw3wg8J
zeU2xe6_MX)R%;?Y+e!QR{@_%;*yCL%>LKc%oUoUL3oPImWSCbNp@A`SHF5f7`++c$
z0?YMe7P$9(uS{hT#8ve=a7ttn1q90;G2}J{&;Z;f!jzcQiA5Uu?REj6#~4n8@`_%P
z&IU1G)*PBt0B?A#9pyi1Fwd-8zZnCiUd-$Yr7PKCTW|bL`WhHKcuyX31({_=`wR(u
z4i5T{h_BF?9TJwzxc!p1HS1YM>eL9(m9ExHexlU3HIyBPPH67Rxt}nKtF@^t96*2`
z_A9B6YR7YBE=7PYfjQXtP;J2v_|%qzkp=({dS_6x0I8UCF)AqtSCz!2QWqO-B_(MG
zDUxQ87gMUs`?sVIqVsaw$khJNRv#FbM=-|(I|iDj(<GD{W%eAH+~E6;EmSG{ws@5h
zz60Ms;=pFwW28A~uq9udpOPG>M#R746W{1VIgpbq0&r$lP*^IY&{wh?CtIUM43m@T
zSazlIIvu5*!o}jvmLof>jnQf>IUJ!??$@JC=49zJEOc&f*xSPd#)y~X2xjOv56ci;
zLYYLw9nHW$lFci%`VFhMr<y4CZvLYS&TL!@SxSQ4XVUADfL`$C9s=vM2_iFMY>Vny
z`j`1;K5<>In?<~WVoOCoZf;<2$K{NhG~~3*?e_%kl8<<|fefuN?6>!;jW0I@*+3zU
zNTgpVamM=og%m|JtEN*DgCwC!Gk_of3y_tzE+?d22iu^<0e*n$xWY5Ja$=8%+kX@Q
zW|;N^Tgb_p!X9;3bMxsa1M$lx;)gp+sNOlxR~sxgIfB)+l)}oq^Mzco=z2MdbR&R!
z$9vc~C+|KVm$&@$C1qXCe<S8}S{B~BbsKtM_9rteeZ(8|yZ~{&=ASYn=SQr_l}Cet
z=zTJdS|-r2|0;A{ArG}iJ^!T$bO%a2<M$6NLCXiJO>|p9(2+Pb%P3Pbi2J%!I;8=4
zPo?{coX<s*1GT*$$pfi$grrxfFp?a{evEip7KBsdzf$X|IU$lwj%4N;%t)0ukB(=?
z|AT(!#)4MQ?c8(nHu^BGJPqmy(AaTW1y|HcbbDth184wIM=RXcHa13*t}F9$7e%nr
z+`8I^=Bu200$vyq*(U}p*EgV8Y)N}P-p=0pd&IxJf|9&S8|Zj|@VQXv89(a6D%s*s
zBxyQjlX-k~_@)lT>)0~Ml7p5<yhXekNxWUhR)>v{-o!41@i+R)#S%7fL#1KF92w>R
zj`-gZcM<=`N6{y<vP^-;Jn-9h^j9BV|AcyS$NzQR?5q!XiWnn)fJ1JdBCVr9w9?E|
z#Gjz19Z|hW?qA|UVWfhG1J|tRh(+FS`XFUDndx*?xYi+8meBg0cW5Z3B@=uu%;qaT
zu3fXBS2%P98fjyDmasQ9db)p&RktTR(PZB<1BjW?`PCmvryaUlH&2aAE8yHRMn=DG
zBH+=IV4=TvE~hbF(RKiscy#8#u#3Qf7c}N}=2X%*Pex+&7Lvm{3vx5+4JGa@k^Dl{
zw!jfS8xhle!~g`@)kG0E$Em4*PVLd@OSqvj;YScXGm{W*WZ)BOqc@X14JQBBr{wlf
zv7fJ7A~l1+N6_VwH@y=Xe?`nBwchPDBb`^99I`VsGg%wob#tV@GMCY+KCMm2t}(#V
zM~zr&+EnZ00UZCsW)ifP1>W|sh<J$j8`Z@CGGs<qOW<=%3@D>mt(}=H$*gT5s^tU5
z)0cK%VsSY}wdP#_hv@;3P39(>>M7hx4|@J+m{o6`xGIGXtog+@f7rBs)ef(=U?XW+
zFhj7TJ$2EDmMu+cB7gS&G3hs6xt9$Pos)-^%XoaHo>SRl{}wOBz`;gP<u)Ippa9a*
zigA<iNkcuur59YMZhivFpK+-RB2!+XV3dG_M5@}gv|zq%?q9{2h`mi-&#53EYy|OO
znfdMe71j<mg-HH%-kNtFD4nuU!b;MA<#>S{^i>&{+Xw(7{zy8r%*a!D)dM@dSb~uN
zkP~Z56GhEib&s1+a=Ezw&MH==UC}(5qr~mio6~%Av@>r=_D2(-XB1XCgEuJvK7v?A
zdmiR!r6-X}{rZ{Ir4@$;{;%lBcPQ{3-QKs&Fa0t~c3CYj9j$+R7-BIhw>Ug9z<hRk
z7Ym;Qd}u*f_b&a#0XkE1Qnq(tF`f6XsjQ^|pfm+}VLIyGI!_8LWO4+Vu|*0Xr3K-N
zj8y-(;;L_7?mFbb$dllk8ymK}q2ln*((O#vf0i4Q9LbxNj;NJ9r>uLWxTrPDErnm4
zfzmo1dc$5p7(5yswLt;p)VK=pPL4l(WPp1S@orqsw7zpSdP;9y(|tZ70+`kmgea6i
zLRPab1<pY1NWQuFIhu_E_}S8cUDz~@R~t{4daumV`&VNmE6Aq{8YfB2f9_vv^NR!k
zdOQ8|7hH{DPhUO1>z60$6!DpF+Y$Y<wF95HtHKsZ&|@+IcxP$R?fz9hKXB$x1Y^Zh
zo%szB$pFl*SgHa>dA5jyvD&E=p`uZ~VDLf!^fK-%$ANDbP}Ju)v2vL`yX9&M(+wGG
zW)yl$h(6#sBn<cR%tFPZ0H%2e6Qa+O+n&q{7kw}pb>-jqzjD>m1gLa0eK-qfM7S;Q
z^}r7cvdfOuUj%UxV7-CZnx#LHTFnmNQ^bEUDJJvxze3k$*!r1PfP(s9%_Ayyn`d}n
z$M+b+tCeze@hz*mIQ<24W_&DydcWuGDlAi5m{DPqI<?E<#atJpxU+EvCO$jN>15br
zLzEvk9YDb>9C1#w*!Z3WyI539zkINKW4^rSh#rrdSs(d_DyGToNdmOmz*d7y=l>V+
ze<u!52=rva|AAXollk;f3a&1q(xkWZk-oQM6zg{;ODGCP7*qNy1=AlS#|sWxBDsh-
zLgqg3tIBLdAH8s3cchl{DjckxCnyD0rcOy0f3zym4g&tff(?BC>_-2ewF48<m()+K
zZIh8o_r&nUm-jFL8Cp{%E~4f{UTriXWR|a85gPBvCa=U(QMh>m;B>H@h^=wmlX;tA
zvj#JnSmKPqiA7cUeY9ScjT+_-mcHY!-}$%7*S$R}YGJ=`aBGScS7LQ-2duZJBrO<>
z9o$xLdW#vZ-0`6kV@+=Nzvu%vVR{3UWMO`%<^v)rLP`a2XOj{$)=l!No=^WmN5bv>
zB^Q0ho?4!)am0*fm-3iAkzKFRsB=kS3l8m`Gkn>q=G{`*z+=6cp?A!=3ya+w-#>8N
zg~rsm)!S_JSigV5xPpZpOb#`Nu_rL)Tap527{Jb|P7|cq9yh!vs%;>kfdhO%?V(Ds
zYe9M|5vCuZI+jYVa;5{vFh|vRzebR%66`p=L&Og_k0V*M`n?wU|J7vHJwD__zE$GJ
zh4ns+0Ud8s4NLikl=YW{2vY?%2tYX#=O#c{VJP*FM2VVnDvqqB$fv-QOc>8~a|<$H
zrjUAV=6JvjYc0m4jyafHQ|<MF@}@GG%F&eZw=d6Gyxu&-4%e#~7R%=zEnYe5)A@t%
zDH*@BC=+WTMbg54TMKl(fvMi=23Ndi+KS__OywdSYRyub*3&fI3Nzf!m4dj?Z)UTY
zwgX6qXWo<YN!$E{b@iyd-;+z2%qyu=>dLc&Vo4+|zW){gB*QwA=Vi8&a|AhWP9hVz
z{b<{gSrd8?@?<I-Dh7e@cf7x%c|-Heu>NwlfiV3@$u6t*Q)31%Jl5g12Lnl-w5s1C
zIvP6_atr?NX<dpNY=ln72IdyMTH^Z#2bvMI1Y%pz@Vxa>VphvT`tv0Os_1#KN4&i9
z(NfhFp2dnA8z=)PfVD6x1&%FD54(7&4wcalSr`i^&Mh-iE8W7d=DX5(o!(ke2K(d1
z=^F^n6}EoINs?F0|MSy4MsDO4JR=-i;*th)wF~=~0J<O#&KNND$c&@`i%uzo!VZ0H
zSSvTzKM|gu*BVaMP~_YjXg{HGmDPv!IP`&=bo1ETzXu{lZQ}Sv5x*hyO8~13Fo^Kq
zsf}y{xYq7jC8dJ@l_&4kOQ;O!9h5U+$Srj6na8X*9zA^jc6_b0AGK9&C8QbF7azQH
zXR|wNdc#|IEd{Bxmo+!&+;Iyh8+@a?{Som`>y-O>^F6iN2K0I+`aaoHhWm93W9c_=
zpi-iy7}tZf1e9WPAmn5jnzX}G_k?}!28A)glFG{$bj9v5?3`WMj-X-RHh=k#h_4Y(
zaG>&#Kawoa7qBDAe<q=+3Hidu8w7Y5K+bGz&xr>zxRiXG{Vy;#-9>{{PbLH^EXPhy
z&lPTBc5yR*_RjE=HsU_w8u2G$g~gZe?Gfgh1Cd+qcLv<Ne}q7vBEDEN@C9(Ll~-~z
zBo4F$WmLAk(jpzHZ3L$$P2UP4V{3h*I#QE)Y!&A>0$7WeR!3ZB6j(+u6Q)`mU{xT)
z633JA`6_>IBq8#C8kW#OgbdiPq(#+7R+K|JznJ$Q@T!F&gmnE2nl~pK2Q)2n@{8GA
zk-#vC6$-9j*|dZ(Curh2;a_zGudpRK^}L;ZXlLzMv3~!!q^DKBzF1B8ouNo~Nen2=
zL#U4_vodBy8~<O(6jS?A9ala$Y>M_94X!!0iA5hv&NR>*>R%+GDh7pt^zAZ+)bmGR
zXhzmIfc<w&wEFoGX%Sd8!uN=mh=};rG+1|#$=PNE`fiWi*;ygFHg%7%V=7~u?AaN6
zb3nPJ@aly~Z6L!22egtUQ!kGkIuV;22y6nl%t_)mp3PX|t_}u5tIZ;qqMUNU67SnX
z!<QeOUimQXa&N#(`*Jm)xlJKmuvPUs&H?n9D!p3iyLAjuzESdA^Nt0-fMdFkraAf<
z8cpZbaSO@-B}rTCzZja)3Tibzq_V*D#v>hW*I$AcB|hn_9WY}>TjPV%^&1S`NxatC
zr51J`r?vf-EYCBe;~p3uJs<r$-1s$Vg_!_X-y<Gq=--iwj>b!fY}&veI=>xhYR-W>
zaRlctJFR25VEzxhL>Pbi87?uy*wX--TZ{ATS$p22Byi3QhmRz1v$Y%<Th)U&yeB|j
zAU*%NRf$TgxM|x9+N^4GEw}ws5j(>KlxeDll2NDnzJ8*dtWYhmen@*-VeXj>l42P3
zze%xquyT<XP`*U`PqYfR48BCBp$RYzt2lA4EB<)r{^Pb8qBRRb_60I=ZvlN)RW4*~
z8T`RB*Dc|`PNj2rm!ucLGliC%B}40^68MSCce?_qM0T(W?gRZBXUiZC5x*s^*kQ4z
zhRJI%!xv^X?s3Bl4spWu7H~!xa+;ag{=q<r{G;5(KP#}$$uvh{twmbYh3V~V;Ec}t
z83Q^}ke7&l*!=R&q%yPL>L?}ER#+(3ZyuRO(9pJ~56YLCfa_ABd84hw5NdeKXtRiQ
zRGI_?Wfmevg7cNq$-r*S8(D86NCAi%!J1O8mMfAWUlJLfu3wdufrPrG9`V^8kyET2
znL5zN30pj%yUk&qWRdme`fKqDn_99X{Rz|bZ-^M?tmjX5;Sq}(E$he)9^{(<KrC}%
z?SP|&?%Lv6`pWSl>Y7j=$(49k+TSb=Qigi62c=F%XLXiJ#CjU)1n7IAQ>9~$LILQM
z|6Ay{lJzeL08(Z8an4K_e4k_>0XW-3*!~uAXB4Q{eRk_N`{K`%kvo+9t;NHE+g?#X
zleJiu^kW_o_NlQoBgaZte`Xv%Mf_?SQYkL4C@1z-5KK8xd7OKbTuX%CGwLa4$lJDx
zMpK<;uJ@h}=m{FJ;A~dOG8}O6Vh-k2-rF@6Bnqd*0jS`9Fr83O0i}afsB90?i<gU1
z3O#ULIp%#PyU&sUTv`Dk2(j}4e#35m<6&3be=xv~?;L1r4;-k1&afFhv;`eRSWW2B
zS6xlQv3Of&t>-#oE&jBih+kL?S|lL_aqqfpwK}jnJ=b+!Kha&6@Ct{ls_#2o+J#c|
zoek!@eSc4}y5QL4Z1o(jo|X^7s}dYMtAu|6wW$KFcBqTp=6<smN>YF@E_ML&O^l1&
zQ-?n!F`%arO(41<4DGOviU3k>#-N_`TOz%|*qO_E1X8jES0<_-h!{POkLjS-OPtlB
zh&^&JJrC-wxkVCr)p-^GuH^|hch-mCLMBz8W+CE+3Q=pWHh~lRzX3*jM{>TfpNV?y
z_lU=c5hPt=r^@EeR>9t2F-L}YNCscy-Na5d+s_2n>UW^2!&2R#{HtYdWShi*d1bx;
z|4Ngi^fm-x8-Yk`73`95z+ix?vuTtYMX-ui2ZjF(UtP0+@8lYbi_fr<j1%v|<%0=Z
zi%kk-zP+#19u;rrg}gABF46X-2cV;yToQuZphRy?!l@4xx_-g?6&QfHl$v2n&+Etf
zDy9#hZo%q_b~@qcjUr^=pLRrF_4VxGJu`M%T-?HRuXLFnOv}?5J0uZ!wYDG0o@%)I
z$z+&99Bsrm>ig;9&o#pDMnM!%XnTg|C`JCXeAdw|`Bww_)gFo3Sm~(5v~D0+uKUi~
zT}9e=_C&T05ymzGRsY|fFD&dJmISHC%~?pQ1mxt~_tvs<wLwak%}eO<3U?8oBEo`J
zOGrH#EKXlJM^F;9+4cQ_9$BR3urB4#|EeutyjliZSX)J`)8~w{Ia$ZBf@zCIeK#q{
zMi`r`gn!ZRbBXw8#6KhchLG`KwffY157w^yNIR`^y4sWtI?E{lY3bjsIL=qTt+g(V
zltL@rdJfivJ)vV&3Ng{h^EKixf2x}>{<nzVDdLp2jWtgz%Q1|4T}B83<orhu___<f
zdV*E_ye@)j8Q!gAI@;*9FQnsVa76B3KA8_JmYH8?cyk0(^y>gL!UBl;zJR7Bfx4PV
zbUT9)PbE$HWZ!Wx5SCis74Lb!bTRW%Uud(dHxZ<8scINIzGoy7zL|_|;|%8<BPID>
zhn4ss+k7NKL9zMD*q8y%Pt|l)_EH;G`YL|EgGJf1=cSGJ+sGhiNB@PPg&IiFz}uhL
z(Gvz<tnIfl0=%=ihx(2nmD=#0SDOb{SfyNaNY1_VvGr1Xtr!TP*0T8$juC%jQGd9Q
zOBL7=f8Z4LWcr^uNWUZ|kQsV8BSD%|c4d>nLJph8FBM#2y2%D$7NF+L4%=e?d$eil
zx(RDr%m`_XNu?JR>;~fDYPFu=uY1_Jt35MeWBt2`AI2PY@Fn+LW@mb(#VV8$57b)v
zWgD}aBT52Gs$tds^^m9XHa)bxOA%hT`v8=7mekk=x)UCt9p=CJ1TVCp@kbK&BXL{;
zFxOq3O(V28z*qeH$l!sR^DtnWbBW{Wqed{L8IRn^j#Uw>|7dmC8-)^91guFRyH(fx
zoT>c@!+Lw=rlQH&l8(UoV28Zj)KL0WgteeZk``8bE?BkO`gJ(K15skcB&7+yx3sQu
z8=lOqcNU4H3e12-vwME}C<Ru_z}nWwW$kG}V`n7B_rMora5F0_eSB+SB>nQ<8jv#^
z4EK(zQ<BKRy@xJgM{eKXbY4+wyqR5#T38&wGv!mYbB(Na%X?IieNG+-Gnv5@)PB#v
z@76q-8l19C`yXv!lSqz{y~vpu`JJCHZb9O^+$hMi4S$k~D=h-+PfvVm3mQHUm31$#
z>nEDa|Lhoc(u2}QoOwa%Wd|czCU@Rq9+EIMM4}e{R8o{Y@jp4vM|I?hi_|27gXwX%
z(_a_TRS};fg2M9^0FiT+QyI1-m{KU}6rMoyJCe9_vVZyVntL2hPpz}T9jK4y=WEdj
z*23Plq#z=o!w&s}b%_4Ffm!nU_lPG#&c$RuK>%p+A?YZH3+(HoX?8@-6x>qrAC#1Z
z@mm@EOJk+TjMVs&G{c4I_6AGRI;skV<u32GJ(J)69i;o!Tv8A7k|Vx&f#yjaP{B9M
zj`9-mYFa8ud;0a8!w8!kryn+ky|O_S?Ya+OV#5P-$!-56L{OpoVLggtf-yf6w`XVe
z16$33Sn5SdDWH)Jn@m7@#R~W21Oq3bOzf}W^sD-$@2N3?hlu|X@&A6Lq6sL_#&jVa
zx}0Al4mj}C{&JS>5#je_0WgR#qa{&``yZeeo+3sbx>zG6c)%!fld5iKkkToIRYm4J
z_`!w}KN93_#`6&Ii)sGU4tK;OTx`llkz~+=;njP#K&yBpDu&^#^p(?uB%XG@)F&L6
zO60MI0R`^9LHc|)B=X8eW+ug7tl#Wy?o2Hcgy9@E;Lec!Mk`6iC>AxJJ+NBu3`s2T
z<l92N!3-lKl@lG6*+Y+t_4W6_(2-1WplGG4T(ORRiwwYzPVnxTS`Zq&ZdzZ(Ea)Rn
z=LMJe#%C1}$Q#X^g_WE#Q~S)}_k~ba+ahf<i4{5L3tqW^qbiNp`kD-7&X#_^vlcQD
zh(U4uF#%y26N)|&tL7MR;Tc`Z;s%U#!vW>Ep4yCIL|5gah%QMXGmg&Q(u~~VUc*B)
zddA_N*C2Re7e{MW&y0xakhE@75MiR#j2UR(Q6$tX$Lt>ebS0hqRzcg~{GI=fc!~Hz
z5>ny@hBX2fGb$ymG?0_Exit}gi}>v$A;-L0@0Ln5=t>??aRo(^#nCS)4Xb)2A>G7D
z-jhwfB{h%{f%$91FZMX70n^Iak;*)AYX7}eHZF$aJFv5A=ezYd;Q+zjna9hPusEm0
zE*50LLHCv{U+7}2vHvTlD=djkUxoF7yx~TVOS;<|!EWncvht&C&xq&9n1MFJFQh8I
z!H`kl?Z#z4c>$8ykP+*p5>H8H_-rtwum>_N0%%7DRiO25xaO9P2*}fWoZ13Fzj83o
zR`k511?`OwjWzjnGsAgJDv+jn@}G=J6xK?Ms6Fc9f-Gt<4(f{az9tfy0MUKI`lP^}
zm-{{9e?{aGzZgE;@IfOuwnEi)wBb3E_4D>?1f!mv9Ex<V^yS7PR1sfr28SdCl?Xnw
zgt;&^dI`yTNFGP#*RQgDhjK*nV-+?eXX%scZ*kxvUovw#8Fz7A$8d!Rbt3jevJ*Sn
z5Ci^*_-eD|7uP?Lrj&U6E8sGZcx7GA%*{+$h`?+lp=!VbI{Jd;)B4LL5fl;B@zE-x
z@9dEXi`Rg!n62pEhEEruQK{_~8DN-*K?Uy420W`HtFuCFBsbrp7@k#7znWHBEmTj-
z{+h-6dzN&72G>pA5k_3%o+WQSBQnX&T?J-)#j7dNUU-g+X@y%`ZPiee&F((}K)R4C
z+2kqW7u<;p%QyNE3bK2ZJ}N7vO2yb4nA?Eg^ZoB}lwrz}$vk4dzW#~&$eT$aX1&Z0
zB5q^zRby?x-o$fnv+X1SlQ6tYY9LB{CmzJs<EjdlvoQ3s&|u$GKpnB;$a!eZk_HZ4
zZ6)hP5$QGJE1fTIR!vsnP9QecDV$`DEmy4fs30V2XIq1)(A?R~pVlVhwKi<yE-^#(
zDh}-aJJvg(4tbylxBBP|&Z}}TEyjuY2zO+s%||+FDcQm(PFdxSf>izyhd;mbU>LnL
z0oyg=FN{$iv{?LtZ|m46)jod#brccbdA7p_jh@?yDN#N&9L&a`%&+K8ZpPqB&N+Ac
zoeAd^JO2g+RIDAC2&1sQ@nXijV5x#US;CRUpHG2WQ96QIZepp$&CglqGmAgxOWz??
zGDacvsDo}4K+YJyiT5{vp;s=dHH}fTHa_{Pl8bEFus!Q4YVf=+CCj1A5wk{>{R)59
zV@?+$Y|3FbwAkiWzP^$~bym9Ct@)1tw777g;9g2hSZVEog~^D_jj?@K3`ssP=N9%!
z4Iwk#`e<V^BuF?y2Ytr6i}l6NO)&3t@nt}|PfI0qZuxauN1v%iy&=bblVJ$bR^QfN
zfwo3PG?=t)G%;#rzVzo&S*&42q<phnx;Fznf(aIPFV1L39K1)%p+D{UnvF(`4tYie
zOV{C|cM-o?`1+9i$!v=sj1lPhlpI|~BtcxJm%~JVB6BN9qyiS@Ln*tez9Y5cJG=i+
zMqy<iUH`<B3^Ep?0{D*DaLyeq9JpeoplN2~lzQvr-45WR4bo9d;)NLS8H=gy(N^h8
z6q+R8fnf^riu!!`h=nQeUe1W^aJol~C0kp;xa5W=OJ%~4`FU->y8hbyYl(t<z_*lk
zjcFOG!*85Wn`AIA@2H{&`U1b>t#A8RQ>8f|Txh||1@@DzA0x0feg*y4h_J@c5@D+F
z&|(QnBhV{u{zt@j(n@hrr*-tmXf1f4WSN<wEGV5#2)O&rTTuB`!i_zaQ}DkfqMQ_f
zsRkBAxvNRC&t$!=X{Z<Wle5XvDG6vD7)zQhn+3CE2}&Ndw&Tp5DZLy`W0)2|{qWDM
z!;>MNfn#5o2?a=WA!G`s&K#(K7p!~*+$zn--*$en2R%xt1G#uc@x14{n>B|xQLMYe
zhdgj@U8E-{->z94bsmX^&n*01?V&oc83WYfHsE6ROAVXE<yo=@nHj*mv$JmN9dLG}
zh7ljJe*;|I5yjb&E3c`KRi>3r4PdfaGyz-HB4r=!9^aEmUNQh#jO1cxJ3Vt=d4iW%
zoMA<O_3>u_nt_9MW%25WzeDPz*MA{vyc+6}1CkHfuHj79?{NVol(35#tJ_<ma{CCC
z<^<wO2-wL~)5Dto&=OSEOsymRU}$sz6?JP4xwph=Fa?F2se<nQqXz<sh&!0tNz#xY
zC%&vHJhx&mrsOtYXvfVrwjGQ8&nR6TxGCkEqTy#G^+)nVH`!0d!~Y)XrkE@YsE=dV
z48WN=x{^J+8ts!U)o~jje6V4u5%Fl9JvEj;edIqf?Ehl3I|}RXC=#1bUZp(%Kz#gz
zO@W>-xJc3`+3$qJ{a}Q37<ALVY;XRK;^c`kNQrhq<lUllpv%5}tVoh0K12(%#~x#;
zjK5t#;MVp_$TXR9qQagE&j2!V?x%<f)Ks9TC`f^GN<`ZTS|D~VF(?3aCabRTCn<t%
z0O*Ac6wuUx`Czm}zqFBKA~>o7J+a&h(@q|w5~MI8PR@l$)-I}Qf*>2$2T06p#fdqy
zBcf)+NL@EOVF!$M;GVDKG39cw{C*R@cho9cI<_;*yj2y+H$kk@PqFt3_aYJsKIpVL
zk8Uvb`4v-ni3s5HXeVLYep<vMl@XS`Rc+shFel%_448Z{!OA<50qL@GbgmbZ*4^w!
zs)Lb>RMDUydgRCeH|%HRoJkH6);+(oLTP75_I3mYB$_*mB)tt$=zx;hdKzuc-Zq5(
zl0zZcSZ*7+@zRkhIRdW{e?<JonyHIR#k|G(o#0<acA++XflBck3_zIVB3SA>qDRk$
z-E&jh6m*7vedo6Ct&>|#S?X|3k8p0>?CglYv8Y-|Cy!GGqOk5^{kpK)cTk?#`4PNP
zKs8%8nwCQ>$iN}h+ZNtGW9eSV#cCs?&k^6PK%Q@=P?hE1GgA3PP3em39eDCy_fZfp
z`JT$;NR7>a>ZvcKQtmK}@s6guq9E1V9MTQ_jC6CkdAZ4`*WB2(jUu+_l_#R_fxz!2
z>l$6wq7ot%O2bx3hACx!K#lbgU)MV5D;*R^Ht!xcoMH4OtC$vWWWc3$irf(l!ZPYT
zTaX?=6>&4xLa4X5kQ!{nl)PV2PK8U?^Y)Bc@iIWVg*~Kq08ujYRhjuc;nUmi_B>4V
zj2nD3N#X2qr6-m~WwHKmR1F38-VD}veAb67=#~0RfSWQC{VDwliz<s_(?A-ZSoRaE
zeE|pOAHna;3cp8_A!cys7f#OkqxF}Jc(SgM#>8s^fL|eK7a8EdQEQ-G0=G5UV-8!C
z*qwiQZx6KTxMoS819ocyKyAxFi{%))3M7yt2ipSxzqyV=QEFr%)b4#|6s~-x{&?~7
zBJyN;aF3o?)ZUPE!QKLZ^kK4?0viyTQma|Idjp(7h2C1MRDlD$W2lNk)Plpx_^O7!
z-Jo9e%-YTZ&TpHe2;>8H#xu3J#CL9R+??gpE}!Z8j_b<|-@oy-{n`O4gSwKxDNJ7p
zAQHKjiT4W5=bhQTia_sHxtADikwi>JfwbV**}f}7;DLu$7P+{8>Fy+eEE&+FYS`Y=
zpOnMgG4RGadMYFQq|kg{OMukKa{{d8fc>zM_E(7cebNV@B3{Ph@rLCC=6ZvI+kGUW
zkw=cW#B?=BM}q7$i_eXo86$q9|0jqTUXb7Z{*qq{b<XVV8^OH7b#7!(+L8{@cExV1
z=#$|RWeAH;nQLvIAaYtZX);;Hgi`6U_CXy_P^$36S8FB@QoQz~48CKOi3(^Bc$>^R
z%q6Nz_kV8>136eTZHc&K1W&%Mr^$Al|2umol1#I(pm+JG4`2Ws9XVvDh_4ZEJf^Q@
zcPDtA1&Po>o)#=`Wz^)(&Uj~J@$Ge?^RA*wr<`UoLrD48g_x`5e0dBQxxJT&uMx+H
zU(6mBXSlahDmlV@o_{rg%D6^=8LC_HPZLl?<$@kSxxvIK2A$2(%**Yq48l4K3Tra{
zKny`kI=p9qTF&aUYZXnyzF$#$WR?XEbVIcE(3tc!PzTRBD+f+ROM+Kett;g}u10{0
z?#-=b;DTE3jg`JFp7IhY{v7N{!9D$l3U+~uQ|Ruf;41$Vukn_QA{Z%<zCt@VpPAV`
z_M0(OHJqINf5)76c;eg=q7TX33Y))@f{P0na5BT@IPi<5rbnzjtgI~%UM1I`F^A%b
zC26f>pBtk(tcZZyN95%H-_2uo6mHw4urNJ59rcNO6W|tjA60+CWZaAMKU$7_jCjDs
z4c3Y=S~fea9caxumjI0n^q>Mj0Ybe`ghU0X{hq%&+vK4f^!N-rc1+H{T9g|AWQpsq
zI6v}b`-B1UBXs5zjEuI*Z9c<=JZhha07{6NtLgu8$g{@a|8=vGH8@o&V`VI1WfwoJ
ztFNWszzGmP0i`qY!qK3l8j)@{FvHi!h$k|%JBlzPUOfX@x0@;8d^Z<c*$Z30<NFVE
zOYYVipe(vGy>-E=_7H3G7G;9qc2)*x@YtfWlw)?-Nx43QE%0TJ_fzvt1F|}8@>dfD
z;y?MnDzop%4qNLBOZyN4qZuG-RR_AVx+jSBT{6)}6(otAR4jiY80{=;*p^WaWd3)U
zO|>io)LdhmumG7!lS@0<4Q-4Q>cN_+Q|ab+w%|EO_h?l|;gQa&gy}2e#Pl3jIi@RI
zq8!uW62u`l=JDsB9IX?xw6$1JFygQF_-z@Yi;r?3N%_(B*VZt1CStU3CN5=ItX~T#
z9<A2bBd&k5{-rNA^mf4_=G41`#A~2?dzN0$1Y5Z_o?zMUU=C}m=go)ui#<kkO734p
z=*r3!Z)pB98&uI!R)6v`R>mgxG^^~aYe3$qx8X1whd;9dm^|(S*ie~KY2kJ8{SOpC
z8+?|iy3*7v6*~1agYLNU13o+BEIIA0g454kxUJ-1XYI|})doJ`81Xm$xJv>+iB#r5
z_<ck|al!j$CQMu~=^|OA*Y2SU#UbV_Z?>7g>WJ*D_LFh_=hZ-ng@j4$UBwdYBYv}_
zT&0-I*onn!FcJU3qIPbfV}A4FdXSQq@PV;Gk}H;gvn#PVfD0RCm>$w_k5!CjjA}%A
zOq85CgK%<Fz5k1yKbjTH?>TP{QxoS`k-w`KMv~s#)Upc%UBHkvq<<4t`<`KLisyUt
z#6Rul!c*LKoR|Y~Tb1<&%Y9vsW0Ftr4f;HS!y7QUF!dq^ioiWYe1Yw%O_u-0pPhgz
zL-PI}ePvDP-q|Ui$T|!5Yg-JOvwj5+*F9L!3EW#_Y2nSkjU>@(iS3Zg4=~vy8<A!}
zInj4q`pq8nRpUj<O#rAPAgni99<Rj5)#D2avj-ZV=!b84^NBsr@WdPGhYFBaBcd}K
zPdL9cqGY2CQ!)!EB3`f`Iq=L<$?N*50?O+K{+leJ|0N=$JXIy=LHB>QBz`gsm*bZz
zD~t?&*|#;gXVT`LJa;fXSz*^!twBhJqkD|_#pWo?nCzL5a*vZp5!BflFiQ(l1!>pF
z9&RHb8Bk;FSI5uqD9d#i?Bbns;ph(UvqSseMtqA1=$H|0>9AHY)i$stw{SCil^+pB
z#IGo1HyU9|>OCb;{9>K)uMsaa3vC#%3zU17{jzL=5CM5u%XnQQJRjpHZcTF_I?Vfh
zvj(HuyWo4lKS~=58WEqV@87;(&7D``yDGhJ5#PA%$@Xs%{TgPje8tFwGDf5JdVs-(
z_Z-2&6EORCwqOo7k5;`ZHb14N6T2Y*)v%idPyH%3>!7*iYi|s64c57^x|3@>7O%>D
zih!^7Vh1dIH^su@<jkmhrY%M=#0~**j+2=)u(w1DBf?lkcy%wUZAlI&Hj@Bdh@>*}
zjeH{apGi3hoEGUvXAI91E=<y>9e6IRGb5$(XVCB(@f<OlFBBLH;`H0*BmyTU4>n(P
zF#4n*m{$pabE(pZ0Q!hU&ynz-5n04<R%E+<a{=>~(TB3fZM2N#ID(BALbyc{Sj5I2
z_%#AZDn2Yx%-=R(Hl5$Bk*LQcj=1p}aFWu1eZ;qj|2N`q1k2nOqcu&nFva2}A&U@f
zXQcnVfeSS$E*yAmp+_w74B7q_kwyG&9NvbEO99}){C;PaEFvgAN?#^?eT|{01AQPb
z-`S!JYY-}hiPE2zU)7x9jp(5asK!3ah+j={Cwg~cQcb<Sg`CHpmDZFMR=Oz2pO32n
zFSk1Mb3~X>*rlIm()9PFh?k=oyCd9=%QW-K8E7|~C}ODsdd2%IkJA2)rbOPje@V4!
zc1VG_1e2?(vIt-$A3s^Mgfhs?0_%o?Ak2qXMh|Az_A^2O)yN9c3~1tML><_5SQ^h8
z&uqOTyiES)0qi`1J0AFp-1xuK+IZcB{X|X1LP<_HZK~NbBfYQqw+@%u*#6}x6&E#_
z(i~`JkHoCZ+B*tc|7rboe(vn49%o}yy3J`g691?PYi|=oUnwt2z`0SzlE|(zE2^NK
z4_4UtxDhNf(N|@Fq$IM4DJhcEJ5L(;7V&?Pg-^E1N0KkWmuVe-OGfzQ7@gbw$R0fN
zbg>N(;kE1Bl8}sBdm%6_D6GkPIQK>s%%|iL8+rx!bjbmBA2Qe-^~>>v4K^@sTK|<<
z>wA2N|F9yADM!!N2ts<}(kxlBgUK{tQ(}Euv#TX&E!KM=4?K_@>^EarVeIL|ZO?nI
zQl7)|+a=`2)|CVeY3JvUkgB*}pPnGo3E1z3(=!1H)XaX~v>%sx6h`_gC&{eVm&TCM
zlJ`9@KeMjlg8wRZq44YP$$(26w3b=zP9n{2Q-L8Lv?o)VSc}%GdIvr(wciQZ&KFqn
z8l$=zl9D#^m|XvWgy*mfgCqA<LkdpoIV|&QKt+|?6GH^l<^%g^>bjf#>&#cUxRw)L
zaL@17XdzYPVhMzSK5=r#<y__g6=bs+19F&(1GY4>E@w7X8qlxFI4w1yv{XW7FI1r>
zn5eMcM|@BG*NC5ZW(npdpfl&^0c=|G!`tGOegl%hRaR#0fls~tJar&yWT`9>bP>1e
znAv6@^t|rBu(D@6cN6C}$Ou2L8<A#}>93F!x<9AkzeW6M#9Z#zf8jY-SE$dCaYBza
z*!@UUjE{=Yii}5At}tXU*eyr${+3xX_tp+F3!+<;3lVXN_zH--eV9r^gYoeiV$j1|
zN`I1$W0@>6sd;QRqeuGNb2`E{blicvRjsFCQO+#hWYfjdMxp`}ec*Cb@j2L-?JEa3
zTRY&<9Kbt2-0^;G_}xbtjL6+bDh8$L=31O-4Ldg^2Uuy+fucjsy%ZJ$T00K6R}a5F
zvF#&HL&fp;h(9sb?Or5tla)c%U|sGks-`q@#wq-~n)W;y{N5X{H6>sB7V%?6fsRHE
zY0V$k=hsy<3IL@A%BxZ5?fT719|{Xsd)SNueZ|ZMToMzWh&}_^%^PR-)kdpdthr_b
zeF_<hJbnkUDt(>rMX@1O@mwhx3rXG~AkA^+4KV8V<_!2>F?}#Ng`<sUP;q(jS};jU
zLB%9Cs2Dnwpg9c=DTVUH=<b$F8hLg>VYS;~Q%pFLGuC|JGelhz@dWnEn7W)AP-gO1
zA;rNs)Qr?j8S#b>(>4^=>>&2Kfn(|tG1UZEzM?w}&T|7Dkr{#gA$&Vn8|8qP-<v9~
zgjXAqPZ_NUtr)(yO`dGZ;!CpgvE*#d<{~nCzHM+(&+8(pXxI)fQ(0kWU?4%kt)&Q5
z&Gw&2Tm~b>M37u8PHmil(#XdP{x~-+)_kR3HZvHd2BoPP0$6{6h}^SS3m8+te|?FF
zh<DEFWjRgMAyI0)dwL`!`M;VZa<Dz$c0evcc{E|+#06GXfi4miir{Z$50ec3bf8V}
z6QzPw1sJBptoYrCE@}t=)^2Wo;BAze?&0tSyajA>8{K;=0$*X+c3Ats_vhSHja2u~
zh(D7u){NADV0;C*>y4n%5G02LJ?avqQXZ_?Aj~!Fp|*2tNxr=bGp}c!;8dget4Szh
zts)DWJRhmI@35%WrmIPw`hf~)q-W`gJa0gF+tcH5LIZhC44@{}dgh%C&t5s<LzNd5
z2@N^{9}6BSFmLWxW;z@J#I2#(?K2?wJCZZ^HX`APP`;7Hi$-twcL9P?LU9FIjR}07
z*<%S7G7Nnn_8$N-(?_AFh<J$jJ5{d}>rydSrNxpzB{J9v?>`t9QJAli51UI|e{Ry)
z&O~>}HYD0rKjN_IfW!f3)^s?5qpxV3?m#BhdjH>PxmUMaX`)fFlg}_E3l>sE)ZoT0
z3~{X^eqZW-r%mIj5$y>y1d(LmFyBKhmA-y*;4_VUvd53;rfOo~-R4AR)u>BmO1L5<
zBb)gZInIYK30_JhT#XftI>aKW3FaKwy_KHMkQXH|F8?U{%qh<o12DM>aB~j$b#s=t
zf64^t3yh*#3eig3RF^n#RcZwh2HrET|KE5R#@m_#9B|YHL3bc@?+m&$YYtr|Am+8_
z5pri}x0<N7BRqbxr$qlW$p&Pr_K8{Z*TO5Qm0;Xt;RiG_m9>*f@RRY6K5^cuoJQ9q
zmwbQ)8ElSXs)H4QI?$)kaF<RsXiFkl!;%AZAXsNS08P#$$JM5D9JB|S^`ZrsT6;T?
zd`|0Lj*-j$1;I}XcE~{8J4Dc@O%Nju-fUSF<v&fAIddL1USiQ2{&c>Dy^XaP$Y3^_
z07@$VdyC$qHMPC71YvG)@#R+>{S$%ih#^SDAWT)PZ(+2|HD9`JxyRfN0Fpp$zvjXX
zcgQouqIK3YHPH|Lijpzy<H(F;(BYoZ#~qPy7Gs_^9e^-QVPpJrMv3%>+IowDiwq=8
zwpLivrl9zRn`$;6X*g$x4`!q)zZaNOww~kQm(W-z{|Vmvvw6?l40Peia|~<GypP#e
zzEbs_FuTq~5b67OMz7AtySEmJp6%R_z-5*rD_=cXFpwr7$JkQ<prfjvmRLax5fTyK
z4Am_*|ADTTk+oDEwq?O2Z{3!$W?;~U3_@JstFb^ihx&H@VhO;M5O$VC9ytZiYZ7IM
zc#ZgG;yu5}%rv}3vT}<IAWVXi(VBEYKpEy6(+wOjkuR_~Ju5NceN<t%nuXkMeQ)Gg
z3Ef^=6Mu~v>^9x#NbmJz-QaT~`GXO>rHzxV9f6K`_MUIP;Oo4e(BLf>DQLzR?g#Er
zs`Z(L*+XjRJsSu}@}|!GlXvyirY)_LZ45%>&d%K#5d9MIzap*?pCdlQWTzR#JHObG
z_!Y4483^PI0iFh6d7sjzub=6S-BDS)awOlq6FIn;xtl$U_`KZt9Fr~?XVk2Hpt|5P
zAXgLd_lV!9qs`1$n!Y7gc!wK0<IgG{WOh#~ydgmwNKmD48BAQR!1M+T*vO!AgWoNX
zt+jC1;vTC_020L7nyg)bIwu(Hmd~wTxGvEvy;UOAli>*TYVZ2_%5fW=LE*64hE99A
z8NDV9wneIRfq<{5WQpBLNq~%fx)QMp_J5?BJMzQYrUa)Ho~k5glTMG8+>YGrhZNA&
zVBq}Ml`(g{4f-9D_q#e`U@xwGVMjgfosO4{PT({2!NnNeK)lzmMhnX+{RReuG6hu6
z#r)2Jbok^qFk##SHk{M8FxXtv0ZQw*M&LOfv**hvb7_@X)#0O7>^b@UP9pwb^0cs|
zs9)!g8Q=3r8KUDSdyGz=Xb#+|61B!f)R5VkRnvEdAk}R5VGIaX@ydytSh*4cbiu((
zLpEWYN0Y1Se><8GUXnU~#sQxD72fU9)|m|jc8Ifq;0gmZ8aXh*IjF)^Km=?s;U#xC
z+Fad<&HI%*{kj%Ij)qJIR^h>9H<_{5)g}lHDp@)HrDH7%n=Ye<(}fB4gtd2@D4k<J
z3I^|xLaB;L7WQr;v(^%RaeO8ByKO%wzWHD`H81DMG?a@P*o2pBNs|tird*f93`gME
zq6{}M@$oI<lhHmctaK3i>W?begE^{#>Q_ejUCn>(dYD>pB>#vgc`!j{is1qepD`|C
z13>1>(Q~~O&P!_m?l9KW|C}PeA#p#j;kh+XOW*Q6;;-cT#fzwqNB_{_%b9@jh6yZA
z&JG4faH%pKO<|T38ySG=KqbzkrlOs*DmlmNmH)dqfW1MX3*EOpGY$9V0P4+VHc5SI
zg58V}ci@QGZng<Tad6781R{Ea{1WTDuMt4Kw6d!`+DC=An(QnEF`{rLlRN672Agm{
zti!+Ve=e{-ae9Y{uZVzup@<iFyHtfTa10xS3~#W9vv{iG-0s#6FvEsB>Y%S|N5dDE
zY{%}S`=2A0n#^Xqy|XSRIgRuZ3MiP*c&rH`;}r22@tFWR_xx_x(F<IMEJDhm1ARH~
zBq|#^gwfOzXZ&_!cU0n%x?~Gw|2jybN=M|ywP{mR6BukxR%t_UbEF+-Qvz=j=(>u|
zK3;7hdn1yzSulM>X=#Ss?HKXvvKMZ#`JIhp?zxZ?{AUlBHSxzAn>VvC_t;hy@tMDt
zf<%Az*~?%;AEi|?3VVe56GAqXrOUPb=jJ8neCbHyU9r2hL8r8U=qjclR_<}6IbbeG
zbZ<gFcYgDy-eYGvd>P;0z*MCa{Cp};5%*a5Xtd90iT+~!%VZ5QLkE&dNXp&e!_y)}
z!QqycqOdM5t%o`UBgtY%8(uPU^e5uR3EsW7%Sm5_b#aY!EPl7lu>*UlRImVHk`K#!
z<$P|2Rkv7J>H~XwsB_ER257FQ6gZg>BZa>b8Hh<&f4e%kN2-JmYac!#KA4ldzc9Bu
zT(1yZ1E5V)_#DZ9u!~;UICHkP;j^!qNWFrmzz^07v~l|P5nsS=VN0NN*d0K#&Hyr2
zJHu&K{CvRZ4={2YI&K|~-YT%Usfz*@GHjS$%7Kfd9IXXQokIg(epv@_m#Q4>4rj;k
zE#iB`KbB?v?IUn!D!K!Qa>9EQCTU$zC}J2hI76v`W_8nvHO?qcY`F0T{<MQgd53nY
z?6S85SZN1KMW@O1&@B=AZmolu6%_w7;veP(R58$GlHMAM77WH5t}a}~9eu|Vp=2Zh
z+2;D^Ov!v8{hL(-2XNbH5hBe1{qUawYRaZn!^n&dO)0?n<URrX_jF*q+kA>C;>G;J
zZP|bR@(Ss;BWG4q#bi+`Uk|eI@Ll6vbCTi=(yc%j@r`ckn~ZC)(0t(wy@=D9614pf
zO3EXX&`v0^cI-ieGfu1Mg0GV?H!7&g?lYTLv{0dBR<#k7s}a12IT7`mR<HtY>(_`q
zYjeP=#VDLGsoi=SsppCMxII~X&WM8A2GpcAm>n`yanO^YxdsoHAuCR=zsA2RzW2~M
zQonX!fJ@?ls+C?pFtr-%zq0rZf4L}Q%)N*e6Mb{;XaLZUwih3&m;z_4L*J6E4}5yQ
zK7fTeP<=`P=*SG#suOoyUqXh*$k?Hass2l{fzwAIdPk4US9=ajZj-h~Dl|oM3*zvV
ziWczxb6=8RTuElttpQZ9ye(j^w}~{$@28K-Ak%U!Ca$~(61>x1JS6KkaaxWh^B?%O
zFm7Gc=;Gop5#J-8Eyw>6@rPCa9K`p@<_A`JZ$86^8Hk<icw)~d{$O#0RSC)gGmUlg
z91OaS>+e_9T?&Ycbu%@r$e1kNEdZP&{)qS<@i#p2WV~Z(17^py0SI^%=5e2iCnYCj
z$4aIgdS(cqvJuD=mD%|K?KqNMY`x}E?A|m72TT5T$OF%W()2a*_ynbW!MW+J*IJCw
zB9SV=RlQYOPF9alm8@BFkW!%pog?yl8L5~u>N6wR)AcV5AlDWsrH_{cV>|ipxe(RS
z-m%y(Helz&6k0-z)|R?pT}EV_z0oN@U%jaW)iyT)Xev-^#{FN{6xa+mj+pI40BDx7
zGkK52668=lQs57K(ADOP%pcKOv*XX3mxSfYg144I1pmHUhmQF*@Ksk!yn1x#zw=D#
z_y|5{maJsvPvmP|IaHal{pUxv5#J;JpdC-65E^D*1VWalO$V%eZ!Wn$06pY7FV-Zz
zkv<3nD`F<JPi0|X+8l+_TN!LH{ZvVt+7(MnmgDF57j`N&OK38uJJ;vG*#jTbmtaJ0
z4+qY0n+>*nSdX(7g{WqBl^gfer$u4En;dg-|AW;~!h|zA|970<#m@cwfcXr)AqLDX
zUgj3fE<8HnmI8}eKXF!SK<w|P?Glu2tloX|9=QzwEBU*EwULznd7VG&Dgrc_*#Pq$
z&agA8Kh+TC1g(oX>fUCf>CM^ZM`#-d>)d!nw5OV&XVK}!cl@;rC`+5sqoaCS546m!
z|3l)Lkt@2|GhH$htAxMb+b1%1z5{#gz<%2fY~paWORaCi8^n+|eBvSl6FDrZ?7~k*
z&dmg-d5-v=6wxL&W@l<C(J`Vn2g@VVmrmTf4J0{{LYI~wo=A}w8OSB#PlRv+3aZ+6
zxDmBUr73*D-@Qdl5x*K4FGWfR(BI(yL_<4tamji|3@X+tsv+Vz;{P9!Mf{%;zuWkT
z0qAiBdyeb#r#iB_7R0tyEW|xZ#$hx43HvIg*-ApC>nWjz7fX;^1-++$ceHBsD{Wo}
zklgM3W@I`C8=><HjLgBB>`UTFzrO#AjnT_2BnKX`gigzB%Wf`6vAU;FHQ^C-x1*8G
z=_TX@3N`I}z8u8Ehvb>miz4uWUry|#ntBA;I;$?<UOn2BjFRyT1s;KdIgq3O=y?Dm
zzf&HeFTdJT9hDt#oZ+5SwE=asM#&4`K3PF^arzJRP)ar;`LX1))0#rjZZ=EZUq;Z7
zq^|g=5oR*%Q{e+Uwjk$7t0g;C!jAT-XOmFXR*kF(AlEew8QgzuN%9C-8cjN?Brv@N
zb%01Ee6$%{XG(+*?9qY(3af)TXK}QafxVFrmvsO;M*MCKho#K|lJ2u(nKo1$J^XAK
z;W1fkop0pDJi)?y$p`#`e-YQ-@*b6%(nd<<Zk3fu_Yt3fd;|V$SSu*}2#ruxUznnF
zwNBh1wqKL=%j}k<r&U9SNX-W;r7pg?rTn>HeoATRt7Jus&GcIvS=+tLh~va?h&`l8
znuO-d=r6s7=<*KzOvceJllvEHo<DVFWWQN==uAO=W}3`K9sI(|RR%gnOXx-(uh6a;
z5WS(I=kf(jNMXlhT%TP32=TWAxp!gKy-~fpEki-1_zMnHl|tRmgf3!R!lO9e+T5gh
zNzWL0M~-y!;YzjkrWq)BgR1kBP3ul6l*!yxi9I|L{?GOvX-u|@a7Wf^GLK)_9?11A
z;9qNxGwW?d?_q*-&{XQW`~XVvnp=jh+D{$vofIWc5cFULF0nnzpi~&htvI(RVon&t
zyhze~<y6xvDCNs$A3&H+Dl>*o>Rqi?iJ!btSV})nhH(b$XCPwjaDdwdpc-Ohv<l|Q
zI;TVdwVP*1l7$8t^FzcVZ&2T6znK99PcnTlza!!N8$-Sn+ZP$IjJE(;i6PBdjsp}>
z`%xpLj^M1qdx*#*-XdNi?s+O<M{`Cx%43Gc{}u5MX!JKK9<pN9$H(O6Rq{V_lNI-s
zv*GWY%V)NE;|LU%ph|vFVgh2?hNOlRAl<z!?`hc`Ij80GXr!;}PHg!sak~wK$(dj#
z8F;aFqnSNVx3fFXO#X1ix*8+s7XE9r8o&jluiQare!muR1djeYllOHH3#x}D%*e-`
zz<tCYNO}KH`*~;aaa<E<av%kVE?D;rk8?6f+;dU{76Ll(EG>0EX2uAqGPF&Djs}pW
zfxBnIdu~I{Q~}G9@2~6^LhW+h-X$U5eDf)1%;jVNy|(<X;6k#s0fixe;M?{FUrQbn
zueOGT5=?}L%uSo5<SNL(E^7|aL3n4Hk=(MD4g=~p4{)(;ftmP+nwJ_}=XG7j%&d3)
z#Q4a}(uxlKQ)$mOT-cwALsvj#W+An<_iS^}l@-``l(OenW4%~|ZPx8Id!xsQzk}2R
zgY&}7fIpAKjYx%Z#P3|VuL>zZX%uWa%|Z2*^YS`(*iy^s9M|@zy!bbsIPr~oir3UT
zE}^kn_0RV=5sy3qQ1{lOtC_In7NTIoe$<L^;oH=Kbbx?=#>_Wj*oYW4r%;r!=LPO$
zmO}hkV~A{t^2DFqM+7loUWFg_<j-^ocNpxv!biSeC32<pX$F&F?L9K=U;u*-qWVwk
zwUAA$+zI~cvyso=A|`8b3V-vB0xC^oTyQ;M7As*|jcqIp;KAA$1=DM?iNaD~_53Xy
zfZWYwq@P-dL_6fx6Og`?#g8<gRs{Ze>F$ZWu|5OR>ZublK)r+MVkQRPUOj+Ss3md}
z1@)HyjZ}6TyM)E7LtofnE2=DNf$otJrU$UFthRuKn|)Z#MKxq28+{lV3;V`2hMIP?
znhA=9A?uAWEE+qvQmTsBl?4W|cnbp#>t#_3o{}}4u$DzhWqfB8GS4WbC3h|Ptk`sb
zVaqmO+at+Ri!e5`#>_{45`GDtwEzbF3OZyp=;^`}M-~pCFw{JsoVjW#Of4Vqp9)Y{
zqMtNdGZHzOCL-`ni~aW+(vOHAT(c|JL=D&L2MP&5RhFZ@<FUi&)Vt()jDG#v`HUoF
z!XoBRIoNQ_#rs>jd+s@W4L0M-mxe7!RcJzKrTK!9Z>}M4I!1ndSO&`tcZnI1dsahG
znQWolZpX*>=-1@#dt{p3x&Sp8#eZIQ&W>czSL+nJp^sYtn1}cLPkV^$z)Ey*^Y=i!
zDmlQy>lyvV^X~n^X6y)VO~mguT;ldp1?w8|!hMg}&514PZHE>PEG&#%XiR>94E_r!
z)hD4R@Jv@`(Ws)XFfwuUN)ow^PS%{9Gsnf|*39PvX10sKJ?Q#iyj^C})e?VPtjW9_
zm|Yp?Gv_|{R+|XU;2}YvBBx);U{2Q7uYzE}6mF=1R7x*k>(s-n77Gz{PwU4z%cjrV
zs=C7O038`PC@q3b6leC<@D?OKb53+YQ?F45sr~eD%@sofGn=A&A_=)}+TTEzZwcr5
z4N5rYA5W(3O(~R}O-fNyrGjw~z?wwD<LCDY6`U}@K!1aFF53*CojpO=We#FzJpjhK
z1}GEi?Wrlf)est^(LW4eJ#eFCp4w!OpOI9#2hH9x@ZNScH+&(&^(7)Aez8P(VLYux
z+%O;95*viC+r)wT^=nB**PkpWQI;;JurS`R-i6+#_Wli@TiP^|NYJw)mMlMvF#lw=
z31R3-JNff9AGBXkL3Ef)W>8@Ef#;7p87Hf7%RW@o%Yb;l@!+t<8nh6FGez8(>30U1
z6jM{{^V+=p+3WyX<SY{-4pR<Q#Du!eWvHd`N+p%aPd-t2A~A3I={pU)%6e4Lx~?m5
zVM)Y2RkN?m-xdF$0z~l<0+DP(tZZ*xRH|vFqYPBRx^jAVxWUQD=n?EuuMe=Jd$%to
zwF7nJ3borpg`paI&OuX|KEry$w!K|bhz<VXK&7$;yGn@iSHxeNv#j)0vftdsX}3U%
zIq`jo_?F0UlU0=jssqh`w>fX06A^e=k3?<0Ze)~!)Vn<rybTJXFwwI>(-rOcgdrC^
zUUUJ2Q~>FDfMOU6Q*kY?h_p|emoOoyzkoQ79G)EY@@R$D#rpTCHoQbKFu|nYjO@rp
zH>#Nj!i6OJC+MU<BKG75sT6(|mi@1YXPA?bNIc<%CJxB3PG8-B^=3A?v4DGmp-#%^
zE2s5SaA=Ik)Ys0^pg-4Ttc**zheEts5^-nn{F*efmHrt8Zm7ePs6A9PxTyBU4OA#l
zcYM2K)f2w8p_O2?P;vW++<>EU$%*egSpGI_%0NqsJ=0PsH&|VLw0_u>122(LRrt-F
z`LM|ZO8Yek6Xa^S$RA5(`Dhf(b|-LcMH6MTjnRFbnMX>&Og2MK%t>u>qTK3Z=}Xp|
zF)Ao*fLUfn(tjjDm~13_jl0}K{^St*6FmQdgY^t-Vgf+9eDI2dG{sV8b@Lu)s}$=E
z0uUXI;Ua>zD@mD&5FT-iC!E#<wf2>o&+XNif0=*rXhAmzwGO7az1_bG7(L<UU{tFx
zZ5#2tew>t>V8%j9fH-npwTa*lNfJ3SY4TU-p({V{$>}Q7bZn^bdiF0!!o_^eKb*th
zw5ef~3(3Jsb1HKNNj(5S3uA;A>!*H=`PIVK%%svUrU&01f*}|p8F<Hm4G!X$h$iA+
z>j0$sMouGYZ(?NmlFT^ng$t-`80H(n><g#(jTX3jd(Q?1TbYy1O_Z7Q2cpw&UtT7+
zp^&Eq5UlGMBi3)$ktFfDV;c_ux1F_=&i!V8u5Eb241W-buEt&;5F$uOv9W<UA8`+3
zr-rpVOgF6n>&+Pl=nW!9HL0*_;?-)+x24|$cHV$*4Oz>`b98EgNwJAwW~BRy_p2Go
zyT|Bfmv*;yAYoBTXg)L4E-p{8fAReyZ-gDgMDUG;kt>4malLt`h`-RBDBs)hLGt9?
zx)U?_#q-h!?aXq7<42>X7hnEI#8>3=O2OM@^PE`Bjn@S55{<2b{7BHBIn>8Bhprmh
zK-{Tl#L-7T0$=-e5-S>79P*inli6>xh^Ng5A_c+e1pj)p4DQ}4%W4rw)9)V?E%so?
zjO%Zg|JGQMMF6=nL876`cQMeDp=v#E?tfTye_oh-&f-@RkG;{9X%Tp^F`M#oJs-LQ
z!)8RXMIXT&od0Vn8|>N6IcR+`9l>tB_+e$7p3Biqs)m#E+l7`$+gnjO*%@bB8j+<I
zn^cD}Eq^f?g(ZhAF?_e30SJc#M0ZbJeWHANwF<`K0M4C+Ux|^q6?AiQ;WzC5LN(*I
zfm#;v2NTJsO7oI0Z^O6V=0!BX*DIya2jjq$*`x=kW#aq}1H451!P6x6c&`l>N0DV9
zi$0>ju1E_yjC6xQ4GiVMXu$6gZ**140N&B%EM$ZNUM-WaKe1++VSi=%iyB6T1glyS
ze$$e6>z6jX@tSPpLW4h>&#*+sR}-=<D3k!Z-g7={u1+en4hIs+87x7a3Qol`)11iN
zj_U%5E<WsQC0)#nV`@omKdmXuk!ciJ#BU~t>G?-SmX<a$3VX_LH1-UEmP%L&2>oUA
ziDm<J@RHN2N`g>(ZZ*NQhLTyD@Mal6XHS|dtROk8&+m~7Qw6fc78-2q-3s!Hm+LS}
zoaw=2gB^m5e@FZ?c`B*|A`(O>_o|miU9L9wZ{Bgvg7kd?Lw=*adaw?p4fHm<7iAq2
z59$q2$=-Zfy@uXBPXRl?8dO~T1Kj?^K}>5HJ-+$M%K2Xm*3C1KYb=YvL6P|t)Tg}a
z9Rqrdc(gGeDS~T}kX&KXE8=Yihgbu`>g3I`GIb9(;OK!=U;^15*VFJa)J8`$bdg$_
zh1|I~_mY$WCi4}=@~jMOrfBFf0pSXrEk-}W*hvq1T~0vQ@IJE*A2@?$#3wWFj8HyT
zfc~<4`&NO?umnlT|BBet-Bw!|NfDI1<a|(P7U(nS<k@bjTpg*KP48_6n=n>p!8}Y&
zpN8=!;#Y(JqJ+bQXrcg%<N(*0XW-mQb7@`~EH*jZSjvrc7bB?Y0=|ATmwNmD`Fh@Y
zF8U`vQI)7UyZ^Q-mEArfHwSj06nf=cNe)u2;kQh%Lp@ycwFNP_V>=pXiUkEU%Rqmn
z{Qn2xrh|)nvzG97{Yb`v#*I5ZPRws<I`|21wUGf#rg#q22pMBR!@zxJp7?fng&S*Q
ziv!CmO;mGO>*zB(Ja0%&PVnF5Qbv8<ynZ>dD|+@9T;&}du^D(OlItIsW&hcFfOoh(
zB?l*~)h|p~06H+wmJGDP><cctZ~$8B8fN(W#?=>~sV9SiH3@RMhlM>k?7|eF+@3Gm
z(a(6WnMvvD%@+H~y54hhw-aB!F-s?~*n<sV8gQwN)qZ9``eWS)l_5b;seZP&&!hr5
zD_pKsP%?LwE~I3g$l`ia0c{|lo~l*PH|$MX(%@lhhTNr0rY!zQy)MUGN89wq>N$&x
z|3U4&i}+s=f!&HU%_%0-YXq$%B2diT0V8s2;Zpz9F-fF*c)EK$(_Br=2<aCim8zr;
zJ@LWwch0fmf>9n!pdvN(1`X_GsL>pC#T5;(5Z^cJ_#^4hJ;$(LKGvkCzoD?Ny#1>(
z@z@SiI~x4X?T<GAL<i|BdZ;u-$s6vu!KEyU@Fg;4#m$|ykvA|>VH)U^!kUm!HC%jQ
z2eE_Q9O#QnUn9F%MZ8D+GvYff&0lD4nBmo9#NWM+o^e_x_=+jwE#h0mF5(FX{%T#U
z#by9i@5Mgi8u2w^i1-cZ^_8Zow~w0nbHHs57ybn$vB1zafa>5$M+|OH+{`UAx)>^0
ztbfhSu^qvrH@H(;0Zm-;&sXXHx})|YXWScD7aH0a1(2Z*J_3cpDu_x%Cw`JT(A>5~
z49R4ofO&7;BA%1vPp^Sps;p<?Oh~DWQoXZgou99Q19+r}*`WbbWwyWvRfz-5;2IzB
zRU$L5CPIJPymMcIieq$VS?omUC~ZG~Nu=l*i`0?|l{kPLqu0mf>t<@Thsm->D=erE
zeKl4(eaXHgyw9-eopn{K&aojw9m)5?CXWx?;Du$1O$kg*y-y)Fw*tz*e9Y}4q<p~H
z#?RJ3Ue`>`G{Ip_fzCGn8Mbl4>HU+a@R{eL2|c9&Tn8arD%#Z2ec2QsYY7B)GobjH
zRN%n|L0vv_U>zaqZ=9kxbi<m@@SXfHRdJQT4IQ<D2?!<nf{3_Sq8RH-m^jOtA2u16
zl>ufBP68<X?WloVpsnw??QvNN+r|L3m15s{!a+tQYrv^&U%6buIpU3{dOWdzVa`vg
z-YNO=I}=X08sWmyNQ1u6k>r-?s)m=*b~%E9vd!w#q7m#sZrE~3*^;q(kNAr|{&WFU
zHQSK_HwHpV+UgbM7Y-md|I`w`6s*)X_C4ayi0^CoH5t;WS@eu>p)L^dDi28!MjY&f
z=?_MPN&Q+8VmDIY9O^X#2fd?VlNfZn3Bb*G4-U*Ilh=mEa=^i(bI_SVhS5||B?(PM
z3=$um4%2GLJm07^wixqd71+XLWE(zcCdWmwcBr}44}D}W)6*x0=xRSa5)bC9edgJW
zQ&I%WEOdvt&cb+sl4q<*lb&X9EfUKK<Cov>@NhETw1_%kRe!S*nL^hNdVC^6Z>VsB
zn4)uE7>jbo2VIgjdc6vx0Oox0%;1H}()!#y;mv;4NcV~xs;D&udQMdj>U9!WQeRQl
zHyNhSAO+{IO>bV8a0PUA*v!%3dq-hTV!K5RsWc%`i{HDmHGeXMk=8&)^ItkJCGMm!
z{i57!ngN$w>KR|H#CI}1wn|D17O%3!8?53bI;XdeI5qg)GGM9bN-Bx3%NYoJ*5#7K
zfJG5_vd>fb<P96hIXb6=gA3eRYfUm4XC;W*clh(Ph#_X|op^mR)Ht9#X>6kOhW9L%
zgBxe3bpbeiBfr_L3D95xPoxYv9U&L?;2iOlRN@^b?zVDgSfwN9rhu*-;4dCXWKWw2
zOf+zX_dl}>b3yTJ|DF;ASsB=s#B9e|Qvqe<j~51{FKNlJe<2Z;YP-O0yuV`cQuM6N
zlMS3WP1cy%=KIY~WMP!b60h3kOLUe;ECPV{#tjThRm(w8O4GQ7I=T1>BRNMrVE-x!
zh`hSsfG<e~=nkQ`S>_EB`j1HAN0as6UIlS8tO2#KCrV|w?hoxpiJMwp-swd<@Sz8y
zTI1(eg&NUbVs1(Ut`jWSlVRH1=GA6`nWf3ovMShaMz64<Pi2p;$gSvjuqnmk8Uq$E
zk0lJ%(d3LJ=kk?ZNQ<cNpdUd5F^X3lL~usN&GlC){DA299PvU1F(<x9GxpSe1K%<S
zx_4{sDmi@bSl6NkrY-ESrDMcrJBZ>@ULu~o0C=kbNIKO*cg?ZmE7v-w!!yMBzt;}*
z9Pu56f~X>?VN1TTTW?)h3>a`AX9R3}{ANxM>(`Ik5W@_0HrgjrsLb@e(?{?_jCli^
z_AZbCH~5B!SiJvY^6i$7%T0P$Q-1xn*1#$^97#LR9KeH#2{WS*QWi`Ec5Z_!WT*<p
z{n`Lk0na%uPh88%9`Dfc=4x~O<>(76V!rL8HBg^z<WsSE363ms5L6I1q+tvXfRokX
zYyO)IjTM}*CmV{rE##PKgL#J@5eBMuR#67Iu6=M%&b?!`=EZ*?q3fCEaao@N9pIV_
zG$><uH_a7ZWn0ChVtZi#Oig7MdrIww3T^I`c*I3rt?H`IITf_4_2y0NnaZ>o^Q%tu
zNLF~l%ccVr+_+;o-q^wihHwsM(YJSY{GE6%70W9O*`3n3{_>0vf2I8UNc7EZA^Oeq
zcUt!0+*1h>9==BWE8?98Yo;vvXrlGK!Sc~4s1w}7#hN}A>p$AqwH?p5md~3Yr54q<
zMFt?1;DuYCQ@ay&$Y!Y#y4}EFk9!<3_ppboiTqFmUA%-^MaP_=3u|bX(ZnMg6i^E^
z6Jx(OO@4`c>(`UC0tE1l$A<=yU}qJuCrF93h?5a9H18mmU|r?-ky6yQ3^X$WxMik+
znEagNbxa(1*o;WgL8>+QjGB_KNeSrr|1?%*$d`ldJiKlffd#@_&Ub0_aBHZ2VZB=a
z6Df~s*98~9!?D%Y%a^_;u-k#;{gtdWH%O4J;dj6yJy0LI+B1n7OKVD&@GXhqB|M!P
zI6G<pM{9m7Z7N&}LR-WqbGUy+QS_OweYZO**EfHqI$3aXA2GNR2O?l`-aLXWCv3oJ
zGz5cZ%(>$l7bc+dI~u*08Etm3>0r&K{ns$|A}n9A=!S~d1*RiqvegKdvFsUVQ4>(E
z*6WhC(o_Jg5xDZs4<BrvWn&;|JA;p<KxT#%1L$!A-o1MgEyD5+j-g=9E|yu$KlufY
zZ^MKqdv3!cY<CG2a;3X$Q9xYO090}jb0KQr%7EC$Ms+Q+n404GMC=syy^=gO1~dmp
za4l!5Gow+lL^WZlI5zswRX!`I$vtsL!dC;fon~>fmHz?E(4`i5jL+PB2hFn}Am`+B
zpipo$m^Ilz%$9&&uQyO_4oBlqOE8t_u%1O(z=xLOQd<#S%+1AWzY{<sy??O~2M(aB
zKN&Y7ti6!}%XN!08nIRJhexYZq$&8TJ@{7kEG|aCpkGs1zW=@HZwB^kSoQ}@>%|X#
zG|~e++5RiJdPkI%Cq68rhrTg@Ihs1BFkMH#IZ8#-IBI;|ND?EXsEwvgUlVW8P@El)
zA5r=Ajj<4?bpV>h5}&x7z$+CIZxKJ}t{gXOK%!8^{0yKJNn0j+=<;q2AWB4DH7xKm
zq@vTo?WT}M=wm|7qLPB<$Q)U}=k+L(%urSi%F|hzcm&_WT0-X^F~4e7Sc}wh5I$K?
z)U6Db8Gm(exwDeBmiC{_x&$^Zp(jeoy<~Xg(FXc1=D^&bL{173R;*A)Qy8N#t+S}i
zR6rFfzt)Py0>LF-X%PV@`@EdJI$?~Njg&gBpQolvn0u8=QY9U?FfzEK+;{d3XlSuK
zLil(EM|@=dQ73O<MX=h8Q8zeO(=IrW4Hj&)ry3TRL~VFwG<!-7C^H#hW-O_q-`Tj~
zZ;V8{1>g?spu5TlgOydIMmBZh`sKq90D$?PD;p&;+tn*9z=FnC&zW4Dq)J+auu^ym
zA`7bi&v@)+$@TZR*Nx~UR_qK2y|dh<;~#GX=`@Br0v><_g>}^y?9O>Ti%M^kb8V<t
z4b0RGJG2`VMrIEsA82HHF!E=NcqH`ni36SBMlxetYnvsrH!k&^Fmg>+abVQXo+P?7
zEmj5}{<`@*08xmYrEETetHa6fZB*UHM}&1BOE4uNoa(6gI*Ff*T$i8GA1M_aKWgL@
zZ8`gyf7|$m4A(6%oARF>vZLMz*9-<ilFo^2<g$*zB5CgcT@8`6W6DitJw3}BK;ZYj
zTQxPTt2M_@>LjIA^p30FFDp1F&WdDgz2W1Zi`GC+UAWfZPgRGi*#?c6NFC6Ri2ogt
zA-bDjq;|#x4^}Hp576%spNXS;P@j@|Ii_jTfNG>)g{_EmXRSr{Y3+cNJYR7Dzj6#k
z+mtNOMj1sU#esycxABb`qn$@0!LUBP7ZiV)b)J>WNYeiY>+8|Y<)qWKxwPDJhu5SG
zBCz^sgj2ti%nN<Ah7dMz{OsbMNOJY%jrGE9H}cHZ`tOJ?;%`XjIs@yI`TY$C;J#YS
zonTajMZYs~c2UI3nW$dk-w*W7zSwH-!6O@ucq$iNjp#0ELOFrpcMRKFltD7n>?r-r
zSo6}z{Hw9JX$|B@#EGi+9=~0JA=+gTJAHN7&tx(9w8RDl%))jC>K#0yJylp53^#U3
z3=Ser!G8ynie>D1PkEtS8((Eo{0on+{Y*|I40kb2{*A+*Gx)DOyGKQ{ojn>+FnFVv
zao}J)asGG4(@8tg+XJ<KzEU-OgU4^!=m#9AR;b=QfF04j=AB{KF=JO=VKNtMSP^C3
z@Ik>JUGXaSWHcKR6rF*_D4ahd{*1`!<&v_yP3D>FpjT9FM?%lI?4DDtaRY=E#+55=
ztCo#Kvu}9%XqL;|g#KVbcj9bK#v+YL5+d<SYedAE)t+zU!5GfRbrMzKaT-FG)coF!
z&~2uFy8pQcb_Wf*k7$`Jdyf@#)*NxW{@j+T<aQ=(HD_x_7?Esq{lwqQK1imvw}br(
zj^BvEt!Xw2oBE;Ra=}LA{(c4LAbJP(FSwHdsO1v#^)1v)J3blc*|WMC3B^X-3v=3g
z7P9Ax&PJ!yRs%{i;Cm96lMO`CT?`zp^Cks6k{azGIR<Pd=eL!O0Z7+h@%dz@XkJwD
z_8Cznr9cGedd_T*As6@(foP{P7K<0lOjDiI6O)ei0F%`DWgMX$Y}4P7@KkW~bEC+H
z2qlB(3dSkb5kDf{BR-*PDp{k6CCuD^sXDC4-dpZd!Y=0P8cZ4h<OVX9UKz=WLUOj&
zk&CX~nbxtSvQgSY7{W?ni#5oBuX_%?k`Qle|FDg-iTFL@|A26qxADwVq%zp8ol$+a
z*8oK}gSb2MCmm@?zD#0L4zy!dT8$7&eKn6Z2SLeE`clr@+En?8b!si6-K}-(1<Z@+
z_L279XMSES<=PuEvMVP!NPMLE2(GSJpZ+i(sKFD$I1{{z7`4J0bbcDo#4k#77riCL
zd$N?v<}_8YD7x~R2F{C_`|wc#L&xx*;5cDaO09FMSL3>oZa(yn1i+3$K}+v~_Iz&-
zR=iPgJ)l_R>xBiYmQc3+=#iYc!Q2_Ky&7#-(t|ti|3b;D3iv&7<7$s)m!c`>w^>4A
z1WDZQoRe1`shQhzz2>(aZ94JdrPR8n<az)MW(Gzw`)^|j9JHJwen2RcKq`ruw^_h~
z?U?vUX?V7#D!NSp>G13@KBZMX0U)<g6t{O4rm-0#UI4xGAF1vqt-+K8u)w5V?Aeqm
zz=qkf%fx;f++qQ8tMgaH+q30@;;<I4f}{Ur4qP1-b9Q{i03J4V*csjEZNv|pNpBD2
zIPh5?9)KO?iVN*%1N8`DuY!6++5sUMSG{b84)aPq9;Qs9pQ?_vl^z5WrLmbdB@DWL
zb*@}(gtPQh7sPbUngu91Spg2Hf#TmlW~C`2QYrrg|CHg3?!k_t178iqY#*S^Fh<5L
zJX-m=Cjf7EbA%Nb&-iL_iC1W&Va<YxUh1HcwPz<|_$5r;R#DvXpbF*uId_zCq%NCJ
zH5`f071ggF7VEtpq-thb0MVF>oWkpThpkOk$c)|DSbrJGoWy+NfieYLM8)Z!l8;bP
zyC>q_K_HjJ<{gRu%Q^#{)2v?^qFVtM&&%jv8l#q;|GCNZWKl*g;B2H|7>#{<vm~Ca
zC3~U(m7ge4*rtF6eAdAGr464r2m|}QaSMZW@daRRGDFhIS*+j0N9Hu#l{UY6wC~J#
z^TGv*fBqW>dru_ps76bLm_nzU8yq>Jt+k4DlwvddPO*ujoYE;RYY0B2#WBraxFB+r
zV9YhT=ZLq6Uo2b9S&ZE1s1yOpEr`5$<U&UK2<=Md3n=e(N7H6l7^`t6BMXtvGE#iJ
zL$UW_Ok&C3l(dPY6|9U&kH1=dL@J!g=%~Sza0}~gVG>&y|J(}lIl(k6S}j-%{q=|r
zS_kEdbZz#*|AfX)4?vEENC_IGwt+L3hIk`|AMGh(LC#uIb*#Zo*UblF2yUF1qMacD
z5)0Ycc}!nAnc{ph>_oE*-Zxb+vfm5WQgMFvTu4G&r7W6MEFvPrsL1|&R>uYcmj0(P
z80WI73?jBjW&y=2rsk+sF7)0g6-}XA*nX;aXbu<e6f+7gM{0<a|56R<Oq8i9xy~zc
z0>Dk{L0BrZ%*vsjGk0%(rXZ5^n<g-r|0`G_)yg`%|M8<%rqPgkhy9)+P8=#7KD81o
zL>x7Q(S-_h2G$jfmL4=^VAMWQ+tXV|HbM>XqQGv8cWVTb46$MX75ghotwfe=RDk{&
z@vF@wIKev>_K<*_Q=KQPcVR~7Grv<i)0IytEPqN<(AjC8*}Z~QZp~?zMDq;?QW`ls
zANPs?5~iWb4M%1E<{HR+;0_Br$s?^4ljX9Dm+FAI&E70<tOblrPXJh)0f}3sB^a5c
zwc`Tc`Jk4oUYx$3eJTtDk2FE-VSoSI`byGQW;^$Tw*|)~BeU9Ypwboe@QgKwwvYH8
z@h^@<ZE<FUg51*z67U+%0%cYNPJLK!gAa1fLcuQNY{VYNys%$YHY!fl6ELb<7FVZ_
z>gbsz3ynS2cr;zxcRP+7*MGGGH=rtPjfFT7BsSK+TV`tW5~^iGMUxRTPe-A&R7}3{
z(0~@iQ;_==Ap0}`&)D}j8qr3hdUJzuKX}0Tq7Iy)Q##t@AN91%Sn0rrUf_Zk#uMy~
zM5_`{9pJ1kM5yaJ1DhqbfALgFC0H4}a!;05tRHCP=U*z*!DQsRz^k#)iwvl<p^=59
zG7?7D<`fs9V?l(>?|TBr=FlN3xv0dDa)3j`OT?e3`Bf7URdL63oPOQQl5upTCRS5b
z7<r-Av~m-{&GGVI5s#2e7Zb<l=&7c)4mT`lWL#U%ie#*g?m6WUWjcRGR1trRI5G$D
zl@$2E-Cfqt<MRVNa@<CCZ^8kzIFoAqQ0_1UpBn{o2c6Ja;&WPe61L>o#S(Z)b~Tr0
z<`e#T^9(Bt=Ez0{^WhT!K=QV7egDEcEd}Mtsxl97QL&bRiULsS;ApAf?3K^N{c{ri
z1Cy?8YQqaO_LFgDFLsz0DZDZox%<jMtTK>@xJG<mD<S=laz1plwu-?x_-PFwGUI^f
zJ-b@0(7!*k3aJT7y;_><8adh%F1<3KJSK}*f!_Wg+1|rzO;*pUH%9?yaJHYx@p}7}
zkWfb|aB>hX3av5<vLwo89J>o1<&CFmF5Z8%X_}oq-=;Ea+n58Q^?Q%_lO#~bcF#f<
z{6kK05r9;Lp7Ui1|1WUfa_kDVL*l;*?6l)-T_}A%6NV*bzrAjaV8N}6?iA_tOoPGR
zqVeJYNdp!1Ox#(*QJPNi%0ojJ6h_W7mvYXKinulIA~&qM!NJWs^gjSA2Odu#1Cr4A
zea+zp!CKvJ12lAlB?o|5qit?W&_z5)yhc2kB&#Oqy+lmli0glz#`tCpu&#X39<|s>
z?eCmew#JNB&SW%RPSvahhp6gsX|^z0I$xPsVM7BKCYTy=QEDqt)w5z3(+sL0=)Pj2
z4`h=gb^IqQ1*i6xTg9cpNILtWTI93!8PFN3Sk(*75+z1GBk*s`<{#EeFoHNLgL0*1
zgJGqDG6}#vxUL3FywU^RK<Up8{CTqxEBL}!Jno5{R5ihgB=U8wf>IH^V|i<9X%>H)
zSwfMm7jG`sf5FjLHnl>%e3y?XxQV-%`2N9k(*b$)cXGlM08RD)N%uTC{#5#m+}NGl
z;a34m>ohFWA5}FEo9JsUARRDrIZvIU7s!o$f<aV%&a6apqKF-k(o$%r{JVIG!rac4
zGE~iUt5=p|;e0l<I0MhM>A9Hqi2tStG{@S74b9g*OCSn+h|B@7RN3-&7};$BCK`=f
zGZZQ4CG5)V9~%+^iD%SLc*k~21V32{wqb)UGswkOg~5*j>~a1`g`y(P558@V9zSSM
zD*(sU8o?E~{?WLA8T&o+$f9a9eUWETqj6<1yn;L_0OY6jBMtK<UY!i=UBMd}3lW>q
ztLINMHo3D^XiVijtgpY|K@mIah3WYSB_RrFDGe+BL`vY&DkiNZ9>af_M%m&+dfcE2
z$5KD5Qz*@Vuf}g*X`_EyCO*OKY-CWbUTl$#=xTeM&Xq&4kwK28U#dwtE6m2_->Hmp
z;%ncTppZiWU+gFIcUJ2pT)+5?f9veP>J;h!X6yCW5>CJ{sj5Z8Ke2`XN@U!<#jS~_
ze;y-_xS%`EPljJeGsr8!y&&#ZxWkq|J5XDwHUq#f5dEs>sBd{9r?{tO?&4dR8${^f
zsh$brlP!TZZ^1ywW(p(kkA&nqBNt9dAIz-=(4rs6m;fqO+1_SrcdS-=4TUjVJ$tyv
z>2$<~XV}t33O!Gfdu0+;!6)8f24^$F?bnR@|3}0F5wx<=wqcdMPai#2b^h#Es?iVD
zi_~$dPc};7_RSr8U2%+GB3>eXCmvmG484Svi&utmsr)3QHyJyilelmIgMpffpIk7;
zBR;IOL8WN{q~_>dNazY6+Y2<}!7ORx;&*KF1?*fHe9D<6Uh%}BGzQ#9{AO(1NU?cm
zPe>fr03s}0WSp#q7!)|O!B$~|LYb^jX4Y{}41em7cz-7M+pj60cYB1@l~k^#C*wIH
z!+|^{&l6Aqs<XVfz}?-0UV=m3eUt%9cpgUNsi&X^%2%6ZJZ0BAZck#1j(Ao8qU%iu
z-eGoc&=w`<UtUQPrW8Y58Qy+0k(7{3i=t0@>IH?;p-{W$&gZk-;898(!NutD+ehT2
ze?1C_y<yN{`Tpvo3}&=&QE;ns=}~%cP2t%Pv1lNUDtVvCNmWJ?Dp~m7(PE{t_RLDo
z(%+qucP;|JJ9~H~f18(kAMtlvfKzgYCXV>bn5y94p~{@A^Ygl?BVus?_B{Eg;gJ&(
z`~+(wQAfW<;PXfP67eel;ff$V$Kz!Lx!igwe!d2rO9NZ*g*dJqZ@I2!4Sza@oHY2#
zBh^%Cz;val1V(!}VX>m|W?TIgAJ&l~EfDZzM`W}G`!ixdnJ$TQ0tIjGrXlptH(!{F
zp~5Q%VPHFhDNQ#ir=2k%X^8k1@g8wUo6F6ONoy>@(wjZfFSk)$EvF?7z;C2BDx6Ma
z6B&O~;0$B!fJ(s99g4pU?fgL9qyQCd6c7roZKgfG;8j#iDOhIpVJsX_*duR7(94mi
zI?^WijpqTx`ugu^`3T&_Tf|q~P+*Z?VAs+ubeZ{^)%sLQ)ElPt5@cNA8)*DD!nm{}
zVNcXrye3kfS9UJ2IrYWeQ3IOR4tU_$P9*q3PvdGz?KTCdl&7WQksEYT(AsVynCw9f
z>KYkr5@nB5*ntEpfAhd;dxc=Q;?yKuslgUSG7#2lAWC-UqT{p4NP{Dkn$2Pluhs{!
zW5-U$fz4m)1BVteraSA^bg4#XV#4>tfHOt`+_P?P#F`FWK*Qb)W|--nG^ARy55C6@
zcR=*-5kDZ)79WwBVCX=et_sQ{O1Sgp!vun`urny#-~c+57e}7f_`V*8s{=KfdP|id
zEp^TUEf8>K%vXgc?(kbBHH0S&C^j2;l)>#O-~<PFvH^hACIS)&J43kb$=!6#Cn7<^
zTW)U-tK-(DHu_|PfJ(#xL(+;c*$DHBpGdjp%AeZSc-sNgmZ0^xk#{<2f2Sm{kmAp%
z|0&`(vc`#{A3)A0XvYqGmvYcGSxZi{$KI&Q5s3V{>HRY}&jaBi#~-MRuf#?RP(ha5
zd<B0sAC;2ofwXuRT-|BSLS+U#=Wui2%H?spwS>j5UrLD)^jX2=H(blG*+yNmW)c~O
z31c%pNaBhVz+GTHj#z>E1%_qSTj(P_zbZ}3<E!rG3*4H?-)><v*<Wr#&pi(O%3y&L
z*~E>(nC$xt`T!JF1{SEty0?nJ^K0O*8r-lP(*-|$*;Fx9)*sgxS1$qJj!1IdL|_83
zJ4zNMNor2XGRGf(m;y@&yr(g9CO<@2iCsz$aEbSxK<%ZyK~|Mvy{Y$XV5_v;6Via-
z=kw`1ZyJ!+|BAU*@PY4~fiN_j@>e+t-(*VGU(tM=Fo51zlzx2)Ie|`fxCf4hnvyoW
zPe9WBAYUpl>A^Y!Is|5Wp8cM?>!`T}vG+UvTJ7ukvJQ4z{~Y7)nctKzfxCKiIvy=H
zZQjDGJ$0mI!~Y%e4QW?tqK?JNH4*pAP786<we6WkqP6P)aBuTAx@Gyevnfb-n-pTs
z3hCO0h&|mw?^a(O6Ymp-H3s0`Y;b1;GBs0-RF>UXzntf{OVu0TradLWZgY}bkn$Xs
zO;XBCrE8hi1%!+a(<ke?$ocP%+bK+bktJ9D7V!tu+ho6Q6!VS-dd9VaB@6)9P`FpT
zKu70(ga7~l2c}6xK~&uD*#F6b)Xo3wNs}k&D3u{fh8&L2lyXNuSNICD9JqKbUI!AY
zLC9M&(1|QcI{I%EMDHoQ)YOu{A)`vG=p8+{M|iWa)Lt7q`11?{1Sj5bXPc(7^kmuT
zMgR~8sZ^=v=YfZAh^*Zv2+S!q-rFqby|I_l`cHN+e}24LZ6wl}38RaV7ahg_X3e2W
zNjI;sIM|Ly2}_k%axX=KAg;EA0f=f){LR_id&KvMuba=L8Ejal6xg}dixr-B>jUf@
z{QSEU?mFj2G9dD3Yy9&i6^XX(!Fss8_Ej#&W8J)$gvupncvexANW@D3#M@@{8z^EQ
zm$7C<(C~{*BHQp@wY3uDn994=*Yl;NY={U{dq30DpBW84;QCYYMkb^fFjyXc2lNs*
zeL{$Ly96mlmH0hCKAb>hqQL6)G!S7b!QYYSn_;$6SZ<F}5Y|%+%gvld#C!Fo@rhL(
z_>vOlrm<>tyR5A(4u4J_xx*9BJq@*aE)l646j*~|{(`%&ZAaXB<kk~$yZ#oeSCB-j
z51>KR7~Is={cmQB#)zr6HVCQ4d6s<Q?RArZmW-mSK_jC9=OdEFbHo#e;bs6{mz(%f
z=?*)45@W^xUDOad>--pa|G;{vj@T38HVi0-NUCu0z2$F`2uqc<2te{7Qj27Es(R4i
zJr}#Efn-!#8~epO4;0ntrSnJmudZKJlEB3YR+QEl>mRzX!OX_5PG(HAKb*b)GyY%^
zz<bD9fgQ;}bL{B^zTW_0IXdnmdG51`GF)1*GkN6g$_tZ<&7SZPJ*OwKs)H{AfV)05
zNe!IJ(PTTX_DtIi7EcV9Tz|nMIvfCHYww}<0m<YC5Lj-^%K%nABmrFL%RPhA)C~1u
zPBypwnwPY-nWU91*b9JON&Eu9<@lP>vQDR6kElqdH3icfC?3s44#0YmlG_#FFx%D<
z${>77=<(4q_{GH+80I_6wzIvIe7-j=((NkdG~rjK7uNQnJBtTORGW1M7*;XL$$%;|
z(j3d*ZK}w6h~axurkXYDaX3c;OTK<0UCnd$?qDI4O2$ZRdfT1`nxTC71<$hQ!W6*^
zTI}y4{z_z6Dt>FD4|eO5C1tg#P!=4j&h&(%oVcqihSeG-OIKeB^W2770ENTWIy0=8
z`Y@H*4x~!D_zR2qDI&;{7H=;2@;wf)1i{`}9&`D~`@6-3307)c&p=XHFDc2N>o0)w
zoo<Aq5ixsKdln*`HzgeEm{sVjfB_TYt<Gvv=}X#g3F=>w11T?8eQe|O_+4cj15KgI
zZ9=<H$>PgZJ`OO_Y;AA$pywS=mfbGn<Yr_&Q)6fSTW2Cg>0K7ruUM_)aK}i0r9xq2
znr?1frFn7HcX;B$VsK+Sy|Mnjku&ywfP5VJ&UdnmV-f><z+Dz`2mjHVa6U+d!*m$8
zuT`ck+9z8+8{@=|3<-ay^~>;#(u(V7DfhUNS*Ap!Ht!vRVPGc8Xh?F;mn~F?@;}NN
zB*^%M>z8~abLf)Xg2;j#NK%l{y`y3}eslzS65k2*SaKDW4TNaek(7SP;cO55q~t8N
zBsV9t61NO!kpb+$u98h;)*Up_ksw&VQ43e)eK6U;z@sEYUZg%e!$gGx-WdRwVYHg%
z6Pt}NV$5G7USW}fwR*<~wI-TftYJkRsf+k-j~K13|LgvvMkt-(hRbVAtuR`VAxq$^
zzCX-LFH7Z<<kJO<e_F4<%50q_>4Q~$XDD`#_+v>G-ws;CN2ru6A;b{z!e0)Xo8NKD
zXOlW0Dw_BC0u(=)zB}DNb*&eSpc*4yB1Sqy_h!tCB6zVXO-~jBJR9oQ4<rurJ`Ee$
zb1Fys=p0SK;`>z>6kE~QhWET#C8Z4+q8w-k_Aafrc@Jdl$xk-|*k}cWon=#<F?UxG
z{6dL50a+EFTUxCi4K#HqZ7xX>KI6<5R*5RO|AHo2#hqp|gD;G{y^ucFmdbrGv(FRz
z8))L#0~9XQdK3(#sA^Wm@^cj?V4!Saw4gg!W|F=#3TO`oop!9{nc=1fezHM9HC%Z?
zBiF!H%(baNfv}MQh_Dvcn%MuaS!SK-g?`>eibLL(I*TKoWU>zUw{<U);}WgafJy9W
zZIJaY+^7st+TSO>XIl0h&a&u@SwKQv16)>Pzn$$lOeb|tIJBI$ozJAZAnxs${bptF
zk4-aAXx-V$mc-m&o2L8uX<+1>xc7%nCLD3m;fq**;mDz1BA?!`U0_w1z4_6nz8tT+
znwlMXz&Q!QwG8Lx=Q`akQx5WG5V*ek`4{cyvP$zNN-SZo0A5Ji=N;htm-B($`gbSa
zo;O}_%4+9P8$Y!PQubUA5+#?!UuRvfkvSoSlbhwQ*his{N44zMwq88&VBKtvxTvS+
zs($Godz*H_Wzx(XmWFeuIBdTjD%T1u7Cz5^S~E|6p>SE_xwSbx^M6?b_c^L_P2iP$
zef8hPg|_(?4_G&@z2$Nss!7{?{^NdQnd)5LAZdmbi|j&*i`4BxbfuoEwXShGd%!8|
z&qYq=K*QIyn>&-7;^UHb>fVe<zv;05r%l%xv1QvXx%VocHLU2|#kT#w*z_`^1%XnP
zu>}XtcYgd=e@JM${DDvVn}JofYKdz^NlIc#s#S7PDv)9@GB7mJH89jQvJ5daw=y=i
xGBMOOFtsu;Sef_lFp7rU{FKbJN^}iIRtA<34Gkf=cYqogJYD@<);T3K0RUK1dF22A

literal 0
HcmV?d00001

diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png b/shaders/CRT-Royale.shader/textures/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png
new file mode 100644
index 0000000000000000000000000000000000000000..2c3f21eed13d8087195e16e857b0e1b4ffc31eb6
GIT binary patch
literal 4173
zcmZ{ncRZDE+{Z8L+{ZbDtVo0(jYC#O#*ggd4Cioc*+i5b*@_U6kx@xz2<6x#4aeSl
z^~)#?TZkf_yXU{>kLSEz_jO(OcYWU9?{)uk;z&e2Run%90KkgX*EWK8jlU0)0baA~
zB3Iyz@#<y5Wk7fLnasCzTL9=gNO+2lCjdtPX@~@17=nfXK$4NL0eD>pzzD8306~KQ
zpp5~ze&cT>M1mucBnTXN0AS1vKtBZa1HeoIz!S~{qYyL+OI<T!2G9U(Hw3_)hy>6m
z04TV~4O1onx*(_*0EkFN05Ajr2#)da0st-wdLa^QWTv>o#nAwaKu{l`z@<iDQ9~r%
z5$J)S9{5Z!gxSU*Xc#`@ufbq+NHPQnu+TWH?nZ%2bwdCJQ+X3$NXW<=06gp+gF8Y1
zB!$cj$Y|gS01*Pda7G^l&_pl}Il*V7feG9Qvk3sM_`fiCQ}`7HLpp;&2rwH1q>=I8
zIr|vHb|jI)0+28gZ~>DLGyz+T$*`B%hy?4ROptxBy0H)JXT}}TL52w61{<c}k&^Ln
z01`)J1|&Ehg*Qe33JTT$06_xGL_^riOr{{=y714}5bg-VL*NgQ2^hmO1BYM=-pB@U
z7(^n#oCsrpHUblH*OaSp21FpD02Ts7G%)(hPdKE5;Ryg~3e5nB?&uI8kzfxDBA{v5
zO9lu-fF=NN^$*-XtTm1>VFpI99)J!25+KM%@O2>YBzQ%d;2?lQ!qz`95Magv6!8!y
z>O>f_0MHHtVDW$F<75J_2m%SdaTa*Y2Zl_5fC5lA+&c-Nj0g}Q!>&#UFp-S@@|)t(
z0O|t%f8k95L?Ob)eguGU0AU0mEI<%J2V5KneSk#`a3~nykN~H2fTN?M*K(=wcM9kn
zaC+J+!VOg?;TL8veT%=BtN*``ZkPMNFiAO9`!dD6Rm^PP*6J#sx@Z49YoE-mRy6bc
z14eysH@3*?DXi9b$_S&T=b*VkhEAQOg#q2EYMmK>w=;N2QlVvAX=Qo7DY>Iexw*XG
zVx9QaZ=_@^?9Cs}u$(>49D)ADTTj;)f3@6uFHmaQk-g)QiVHs#5FBMTcCw&}eDT*o
zpDanrGGW|B(1T0Ztp0wTBK4R&H7j7eYGRi2Kam;kq{(YbS5(LQf);}00w&_Gw{+f|
zdTO_8*(LcQn_Xp;;@S9`9JoHy5U;{Lo>3X#dsSgVs4jR@MOf3cK0Y9b7V*e4{Z8D6
zr>{#weZp^_l5X`6c|~T){3R@S!UZbVeEGq4pxJY>K__oP&3l!d?fi}d?YaC`zl2KJ
zwS6ZqionzhMcmc&n);Q#=09OWwjQAoP@_e7i9hdpi0I7H)*CG1_{zK2Los_~%Ep)a
z(1Z)kVowEHZRh=``VQt9Po8t{xYZOBZC&Cm=<0NarPoE|ojBpTLHp8_Re9amre0bQ
z&!eZ8&JFFC7=5nxyE;`?I)M$S%dFY{PqpY3XW{s&?D$z1!v5-T`71hfxkDylN#$lP
zeYBcVwsPy8t2*&+BP`mT+m6#aWqJg)?wO_mHPeK-Bksyp<?u+hpvh0^KZ)OU+bW!1
z`km)i$k_D6)&*4EaXu+#LTi@#6~!X%v+gs+8c1G?y)=TE!MtA{5nQYc@}9&i=)c{&
zA>WKcJlvIjnboXcD9@Md-RMn&xajj8E~KirJ5owtpU%$r7#i)k#ooLTEwzz<e?i|u
zy^p~lUCn$vqrOSx0$()KT>tOTWbd^wDHW0^@%E*ALsx!;a@TtB@b9(mM3|^wd5py}
z%%?G;f8~7|&SH-*)XZMl2J54XyAY>`p6Yb`_jo~*<@d86pamv&0-f!9*Ibc`6Vr=q
z2=3+*0h<&VHkF;+PSoA@e?9Cn1HGs#Z|)uU6_Qy^D!RP+fUo4!J9`DYyE9HJuggkC
zf6E-$+CMJ+mE`+X{$|%;#nwB<$BHXrq=&gQIhuyvC8j+8_!jD)h=g+vNu^irnohFT
zYyeNQ#;G$22e(<qtESGoM4U^~u#Sk4q`z?Nwpv!!t%!j09y=|c34EPl)DGov+`_Jg
z#f**=`!NR|uK*`hjtIGL?f&>sd{A%J^!+N`Qj447lVb9>5H$Ng&b{;#S_oDvJ~7qj
z`cDbsagOGy7PM{QucYrIFSWYLItDLPG`tKgYC{@^{!9*%zBt^zo3F#km2?^3lT($H
zj6dbFhi|M(-mpDg+W&idRW+Uag>F4YZ4z1E>ZtmFn^56&DsI4DttQbxHPCoZb~ef0
zx%&CUw4O|}CPSYS)uWE=z9n>iwA!aNVZ7_7#L3DzhC8Uo##<U}s*}0bik~Id6#q=~
zOQN=Bz5lu?Hg5LF4X6)lE~hlJ+P>2NDOk(e{NVBcZ%e%yw3%ivvtQVGBP3GIm?)Xd
zmQa82dh}DfO*k(U9ei*42LFOjiT^#70qRu3M~vf?)b8F7N7p}LU;O70O1)z8w?AS$
zl1j=SOlwwTg@)u=^}Z<i?;w<U+~|6FKI%))q0EQfhaEV>Y+cMw!&rT2<lSLPjZ}BP
zds)}LvYvMmO$$ADcv}}@G3q+pD`fNx_N1_!yzV1DY1H)t1%VQ+opA1=F5MzgTs=9Y
zkLsDL{g$a2lA-z7<CQTUMq+P(a_N7FNu8vgqiz1UE^^JAr#D^NB5+n2l{bd+zDT`g
z8`L~uEaW`!JLNI<$g9l8XH`5yXo>J7xa^4~9ZK+KVVXKo%X-^wfA0N?LC9B=OpJTn
zvZK@bDVJ@LqCQIrX{m2?W05=+yDvE2zOZ&2D4icsO}86hZyCLp)!fLFpAJN6RUTXs
z297uKpEQaJ-6%i>3?-Pp;8<bJT+)?k)F{05qU4c9m{v{4_vit>2i~s^1riDp&s
z^=xM!u&;OYxxJR-RxGobmTQ=t+)Hrx6k*tOF*2TI{F*VA<?mC9xfW)Pz(}N>Te+>=
zw$(Q3zqaOLBg#gYV-ij^<<E2#GYfns64ZU@JsllyI&a0eocB;UYCKA7Y{r!*7r)Fo
zsXwT>q!h*}k=#@hUfiW3JK2{$uRX~&8+NGDr63q#z-rEKw}0bY0`D0bRZ5lSc(pQZ
zN7&|0C6~oqzrBLz-62PwKzHNG;fBR^oi~P<RcPdFja>4~7u3sw@Ys)ITz1>9giP$+
zI9=`a<mG9*!IM?ub}PO2PC1_}4y0W++h*EE1{yC%V0J(Ge0B?)5mo4Ye<DXe_Fw-m
zXEwCif+o`UqGEP_*;TCW&?joyD%FjhAKlE~)8I=-Py{CX<q1wd6m6r11Y$+aP77&h
z&o5U?sB}+v$OvrKJI~JVb2j9ppg*;9>@l3@$dH{j2ntt3Kisg3=v7QwWxbS?d8mB6
zqa-CPrFl(kB-|*kkU^~3PQBKry+x;@`T3(p{r-!#V{G$nNI~TkR_iINaZm4NzP#Vc
zF|J?kkIHmfl@Xtxww<iYT49Q|h>4ul!{iMBl%7od!a&h5-LpYTJ1_mNTuk!OC2dh2
zg2dR{;K?F96&<IYaH1qbhB9t6&hxg@*J^!ergOpc)6kLJCb#S{>u)`-0UDVfq{7@3
z*$d6wX<bxYdTq4bV>$V5{RYo^m-cU3>fIVO+0tnpA<<$fFU`j0LLYuGwLM%Fi27j}
ztspzWGcx-&+x*C-pq33`kSrW$I>)<gzRq1%+xdbt+rY{z!D_X8H*&r@I4a*yutCFK
zuY0J%&m+^)L!&U3VTgD98LNcAiKLV^7i!$C$=dQ|yF8_0@yE=ywht}q2H9WhIw|(O
zP3|(Z_g$=A8_NMww1w^7D_A+D<9cy>Kd8<}%4**@V(x!aid)WdQw726-ooZ09+DlY
zC#IzzWArw1?QiB=2J$su!h{)94Tnhrg{n`^V#MQLk_GLA%Yx&eukqL@DIJO5H!BKM
z!ZAGOvLXU~(`$`-;xd1_WL~YX?aLoU&794iO-Q>drE$^qyL%B4lR&WiDZIw`q+&yP
zwg3KbuP`ya7R%aN!Tqy2d78=-p^pC=<67dN!0PmmZ-tZD^+2xo>=GOF?5L1jeDs*O
zX?f>~byW<abUxlqA~S4y->c|wufM0-n%6{$r}!h8vGN0VZFwxu4a`DAUdyqx^RrLo
zsuA6^jMCqHiB|&iDE0dUrQuZ5`t;$9E7R*vWg-gJB*`H4Z(4~g$G+NZem^7TTyRXT
z`8}l9mWcN`yIRo6X_2>apB9uFGOy1U#jj1Ih-xwJvT8Nq4l7;1Z5UL)8p{aU*4g{R
z%1C6v2}q-ts~Pn0PT~@2*fQB}?H=>5`lSv|{4JnL&015!H6~Lb%TiK%$2^U*sZIC8
zZ1f$(Moi@(dzaHYXyyKFYS!W9Hvg6A#RYe>k?O2Ry`yeKOKH@b=l}dJi`=%sot@_%
zevc`-$?a~7CyJgqeYjh4n#(d+N2ic3Iz`{9N;}p@a!p{NE@o=0l#r-3qJH7B#WUF`
zzE@pK#`_OQG;szk8n$Raqt{pD%tvWd-Dhn@_eQBkKI;w<EFoD-{^2!_n4j<Xnq%Mb
zu663^wiWP)>C<Qb+Us{r(M-=z(3iWc-DNws=H6g3a^%NPn<`zJ=!!drA9zx9c7-YA
z$Ju@o5B10Oz+!a!apQ``p9dVtJncjCT7swPOH|Yw5oeY<Bf9DE*#aoK&t3SIbRlAz
z*MAlqFT`T+^Wc)~kK9-nvgA3WBQ(#Z9?0or^Az}`Y(*-ma`NwKReA{sKUNr1%9VE4
zt}sHEG4;r^P6{{|qoZyWJyMoTN~B^lYO@?ZuBdgrvGZO0z0NGvIm0Z2l|0=LVgAX<
zjh*o0<>+!=iDY?$UYm#buk+-<n0te)IrTpj|D0&p(U-Y&VZMEr-@X(%Z`EW`wZmY#
zbV1VaC3nLGhSQ0Qp8Cc#lXq+ZrcvEYPwc4rzDk*MEn^OT_XL&-9ro7!HfT2yBN|sc
zzYC##9JrQ0hVkJzC+r`6dwpj#7Bc<L;!oi?LwTHd$KhBJT0~yNeC`<?!*}DYqXN~4
zkigV_w2ho*9G6!#@2e#e*}f@6o5Lox%!6WWR1@z;-4Pu7NYxEe+<MEs{NKYjShnh7
zIVsMvQ=cP?Uv*Fi>=2)mmybVhn!?X3-^~Q}gH65{>j-Lw!Z-o(1Mdr&lGmgS9USKW
z$rcT_J1wLsmq@k2xgYs_=kmwo{4zU7-M>@(tStfmCz$k3xJO+(eJ96fi9V}z!nIs0
zjL~2|wMMvf10B0I8hZ4)TjdDi*gQElo2%sx|AP_p(=qpRviEaVcJy(EHy|r5b5T-S
zMpE_?MMhEiqJpxFlBBe}vb1y)qkGl=Nx1Im<mPhg|4-<)7-54G09J>nU7=}r`+op#
C0}TrR

literal 0
HcmV?d00001

diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearShadowMask.png b/shaders/CRT-Royale.shader/textures/TileableLinearShadowMask.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca40956499f1abf72df8753863635b24325e913b
GIT binary patch
literal 218631
zcmWh!1ymc)6UN=$Deh9i-HH_{UYr(+LvSb(oC3v*{BS7l#XW?yv`BH+q(E^<krE*C
z=l|~A?cKh&GqZQQvvc!(n`~^TO-{l>f`x@euB)SIf`x_ic#DHY^#3B9MWgw#z;jhI
zP{YDbEPr~K=kWMU@K#@26ASbIqkO0~=dp$upz|gG3yYNY|0gz9VKEaHRy3BbrkYvk
zdKJnt$8N2ACmEHuCO)Li5=u$;s)EEki0$E7tXW49UA=LSeKp~b{)g9{>JsxSvq_oU
z@{7(Le1E%ng=24v+f~8gIjY=Qs`pbR8@@_YC{DR$!;AqOzu#u`%l7Zm<p$G23FHJ-
zvHdgE=}RGxHnDHU9P1)#;Sw-@GIeY{<7gu5*=S+2a%@!`1FL9YG=>7bK-CD00g~@P
zo(ihtd1G<^)ZpN*8znHqsvCWWxPW2sB}f3|XndrpEGfzu$C36pQx(rZr<^G4qZ2I$
z1Vf8f+KfBKN0f~Ku@ATs`1h_gh&!D~r7H-8=$<5yBbd>?aFulU-31g#u>%p@<H9e;
zrgzgY6wu=$9N|WiJX*kE{~5}U3_y_DM!yy$@?zs&pN+4Hx|KpIA#dR#UUu;q9JGEf
z4vK0ZA5s+bv7C`)Hqiiq1x&y&pm&M%F?47~xBwIV9xQ>Iu7aGk4O`V7jv>Hsph0kP
zD5ea&0}f+~j-Sp^B`u^%4t?u3!ZH|*p?<VGVL}dZ09T<$F~P~})C>eaUN1;lB|5+`
z6d1pc`zjDnET<z2$!vW0p(ql=M5Du%$lo-ggy?~DLd~g%6M^^`HbV{J_HhlI<a1GD
zKmASw4VnWjwwZAJ1~CJ3fRG<9$(Vx4Is>zwe#2@Sc@#_nRQWdV00k24fCa1W-$!?!
zrBkie<Lkpokv0e*#0%SJ6qE?W&M#-QHfCIhtP~nEVyb7O^WiEZptu2J%Ihl$7|CA3
zKl!RzLaoS6$9gau5*Mj}Fyd)KvLjdEgaxtL(}Vay)arP0I@b&O07|qHk}%Xhb_4DU
zEdw(P9^;pbtH01()KZavVF)qU=NdO`NK%9hfHsj!;z{G|;$%@9EYl<=9KJJ(aV@9@
z#bHR$ct7HS?~3viB6@$4h=Oquz>yuu6`O4wh7FBxn}`h?+$VZE`&Ns}do%!%0lx)s
z9=kp&0}@+0=^%ZleB&M4J4y6kVsJ%)lo%Q`PHj1!ccd_(QYNj0lqOCQoC@uoS*8SM
zN3tRm3aDqs`kP7IlY+SGG|1|Q+whm*Ii+*{mG)nvH0Svth5b`3FFe`^8jXXje2)3a
z{!`=FXXEq2U!wxF3kkV2#clhl8?0m=9*o+sx73XgG(i}yW!+e38J?P#^F<oRkz0{k
zS5(}}#7BN9SS6<VQiWy6EM1W@uuy}xFCQp|yQ=drcoX~-sWERfPGr$EoWQrE?#t-W
zd}**6E43Q^f!qQ{8<CgSdXjWv_*?25#GbPY0F+oR;j8Q_{JEN*8Nj>P2P`M^Y$Kp&
zng#mAmxbK((ge{v>H}-nqTA1XO&vT>Tp(q3uB0Aq_%0ck6iC@?F7EI^1^&b$c_8h0
zVLxCy``Uv9n6+mpI^QtD@1^!dNX-?CkjQ2<PnUBc#lR`D=+2|v<a96i^e2}D($)aU
zs=NRzb8OkussfWur2bL*`(n`CGU-{<sQf3L5BOT<L6%G>z>RzRB=ku?#=uQB^_OA6
z`yd@=;WqK`wwURKS(Fw<7CF0uz==RMIWzjj@!LNsWZ!EFBp5KeQD4QK3GY_ahN%O#
z{M5LW>vcovt}!PcK*{z#E>FvP2fRp9;Iv}ym3rl6iF9eY!-E~@PZ(=M!8JAcOs|yb
zU$ZIl5><3Tyg9};Z6x~h_3ZBLBrb<Z#gc3O$jlUxQP3vMuf^DxcF{qRT>dPQk{YJl
zs=U4VPR!jn{>x6(d0GUCo|-;H?$~)GO(X4K5V3w6PS8({*F<%sX<4%yxb@Qy%y||4
z_+>DqsmULb=ac%J%;YaNVYtI^JS#?ny=xKgkVbxu(K{8eb#M1G(NXTtPO6rcS$FTQ
z-Ga@`p8+2gAzNXxdyblli`m;`RXKY1%X?`-Z0(M7O~f-oRWx)rqR+bn<K>%3F=?tQ
zB5gP}v3=s<PjW@p7_tub@bKuCh1&>H)t$z_dR9V;=8O}>tP=UG$ggSXex__o=omA-
z3rl)&xZDmEBfPIaER%^>#QZ9z9jc!)f&ja4Cfl$YC98J-8?pxw7;`&0UYiobeGkzY
z$)f@7&QqRlKUvAIIiOoF?j7R!>PDlhw1xN9Xiw<O*A$De(R>9-Gm<!Aa!&7YzsYNr
z|His-wTk@(pJ{|sGdPjnSH%?>ii{bSCSu6tzj$4`5-USku1`Fj@g2eJyjUQt)G^Qn
zl!r{Mj7WK{;+2j3T#b$!?br2wy8dK#-=XEe{QBuGipDl(W#i!6wcR=~|FUMiN<13d
zgfOdEJ%jbdDJiyY>+=KueoRm#h(^boJdYQcYIw<;$+P_ri)MVmC6qFtjfJWopHz~t
z%vpP7e{iMR{g);g8Ju_<!o6tIhQkRpb=0YtRwVMZz+ZNVzip!u#}Y_8p)sou&@%bX
zxt={-1{E8WK}^D_w;?R$Ka+Y`0%i$Ov!rr$BZ`BGM#jw$?NEwVr#Jn=CUh0h7fxDD
z3KED(0*rP+k~#5+Rq0EEnyR&HFeaSv!U^B*Fgz6hB0izJPT)1&FPzsw#Gk^fC>S)2
zv07oz&8n41(+e(*X}mwfa0g@Xw-`O;=d+zd6+HQSoc9SF_B)^s&lAbuq7&k+1tH}R
zBC>NJ3Qk7`WwmPT$=``1d|Ie8&G<~l6TIIgaUn35IE=<Whr5ET@0G{d!>N*_9c^r@
zp0o^5cF~FSbZ7s6su>ZMSLMQw9b-a8B=$&B(la;uSzyCn9jRh26j4nh3U*NjvE1|<
zg_6xW=7je<e|8KClHMSQT-*i(ee#c}qh3Ty66zbvAXI+xiz@gk*Tt*~eG!hxYg@g%
z#!xQ@p1prDHXSILBY8MA)8+#QMl(t_0&V>M#LC$-;7mhRzI}0kC`9f!TR?Q*;zby<
z!v>4SfMXzYM(egVo#VHZF?cG6gY0>QNj+*D@=H!!IVo3+ENwWgFg{T{Wb4Qvd@b;J
zeZ*({UZ<Qw-|jL1e;cA;m3y4pjY^2e#H~Jo!$}6C%U~3Pq?}%Ywz5^_PKK6@rv?Fl
z*atYp`G^smdo@k$FyS>*9QOEe_ZQaUvwkc}m_R62Syswn%+rF{*N4<NX@P#vLWxVk
z5&*_+8#<muWu`=@?-TDx{beI*zyyOaJ5a_@r<YbFC`kg1*&>&byvdkFD34;a1N5mV
z3o`65iJB=<nNgswFVxK(u98p&3kmf_P*RvkT&s@1DS7nm{7qod%_If*<|l+A*@)n$
z;|G4fS7`*xVU!@S5gad15jkYI{5}2p7p4036-`va09;)1&4_~p894g$``>K&Ke_zz
zw@2+_W@A8jaib0ylgdoQT`r-&z^1)~MvADtaz+AY>_Y7{(t%DqOh7O0t>ZG3@*-(c
zgvbeu^;6*~LsRuFfHCSk7D)861b_Q8i&{4C>h=XJ1i{P!Y5#$V{!&)iwgtehY~L(n
zgtKlYg_NSh1JQ#NzQgZ$=Qd)WO{L(KGGhpkx0ASIToPHzK6*hau{-^TAUqyPib(P4
z0$KGGUg9`pAMUNw^aAkp^@<Cs`i)aBegJ?nD)E5fyU=5=H9$J#0{voMoj3s$X!<x1
z7VZS=pCJDLAkXX9l93K0cA;3D{csf`+*uLa&i$s^!WV1&ig9I7nNX2~N{AwAqJleC
z>WWkMGkz$R$|x@4V`pHdidrZi&5gH?4#M}_2XJsG-w|r@1rx~=d}#`|Hmt%J>dhod
zxB`;?oEZy7r3IU1dXjhOA-{1)jKl+tdQLt|kXU`_Yr3ZF;Z#Jq=$1La`ra4|T4&{q
z=UEQ7pK~*cS;egY2&1myEA&cdNy@~IlSPYmvoDM<S^<RHEXXub$~-!)`aY`ogp$#R
zn1^L@bm8W|O4T+&>%Gofmcgv+K0?mXrA3z;axl@geAT_q`?g5V(TsVt*gLG}%c=eh
z{v@^kQ7==WfpKN9H;gDem7#*5H-+uQ8}m^TFmWUi2txSnKQXPt`zkCJLu@Epk}H7m
z9MbdyXgaK6==PkohW|YYI0b+KjXv)v_alyvRsWd$fMEcVbRR1rL^f~WJsr6W{J8I&
zYWX2I9FK9_`(;M)(>Jlohvsz6-1p?Vwa|Yvqjv5X14>{oC|gARV~Nx{6)^$+wM?()
zweMX2U~lsiDYx-ns_sxyN|(qX`lUeVjDGZi9bDX>U^Z`e#{v4EFs(eiM-w<Un!FoZ
z6R{IUJQALW>EAN~HL0{J(dhbiaGZ}^*db(s?EJA+BuZkDZ=w$#zY0aGA~I;si1<VG
zOK#WuPI@Hy`mMHaw7G4jWWE719G`|Q?Ey{+@5nDB|9%=EPN+!;B*w7TjT=M}+(H@a
zC6L(NY^=579Zq*AswnJe2Uv6JNuZn5#29<p{mOo~^75I=-}$&q!-bV~@&BMDGC3UJ
zi2J=AUs<Xtn~ptu8^7C?QM6sbSuAM3GfFtfGEbYW>VWi)g82vr^G$v$M!XiBmC6{4
zsqiZUJLC#E<5V3Lx0#0>hd^7OaUP=|yr}`dJ?vvSJGpxP3L$YoBl=f65L1TIh<Wg_
zYd76(q}aw+R&@X5m%<J!)0T>+pIQ0nj_I8<s<IWUyf@_eJvbe3rl`09_<X>lf{?sd
z+xz+ukVEbTBU}ZPgC=)9wB&w`PT|hKg|)sNNyJ1M^b;H);^Vv~{zZ_fjLyK{S|Vji
zntwl^1YcW4zi}NFKdk~nTsPiidb2xG+e7RStVj}HMO6Miv{1&wc5(66;p8$Zj#1&S
zL%>ss92^Gy?O7Bn>J$fP$Jz5jT>~SuclVwTg^S!k>|`^2>zk4N!4p4JHp1SED>3x%
z1%e!t^)Q?p5fNM4u3Q#8mqA~xdFeQXa@Wah8V?~}h7`{j`C7gHq?yS#C+1Ea(aITO
zopHF||AqVej)}cqBq&Azj#mm{Qz)em#rbLSU#51BWsM3PdNCTc1+q=Bq!Q8uT3;OO
z|N8|I2_?yZRUavbTf+&A6|dM{4FFUj|8a&u{qp}^!h4+g*(_zt%jZ{&Ork0Kt6WuY
zn*^PxmBCLBvFXM(;zQtdA_zw#)@_l28{%n*UvB(&J(DRCe4~&AAbqL+rWlx*DNm7d
zUgTT-1tc=m6B~fHC%RWbT;@n}zqiP<<4Iu;fs`!LmQvD$oQ&=;ef~Bi0;jegjSQXF
z8iKuv)fVRju-)fMk-n7_%=`hT+ZH<73_FHfsAbOpSjL<u_HRyRpdr1K5(AyBFwhZA
zRih-1oS#W={to~l$6@=knP-Yw{l!TOlK2{#BLoq#8IpuXQ02knQJ@{e8$q|;I+35m
zbyOiy6qF%)Mn)>wKvyTdZQIjJX$jbx{dEV95&hfJBzL9+wd$p~y9c)OM`MvTNir=@
z9LZ~(bX>j_!I|yB7jP>3i*G>vi&hAZ?J`PR+|z^L@QWSHffhwb?a%g-p2|%y5JSU3
zjIptUgJSdOzT+1JR(v8ah;xM#ag1Eq9l08cV_1q4h_#%X-d{dfOdNnW>EE-W6muV^
z@0BEh7Iqgm?r*33hb?>{AZO<A@@f?5*uV(zt-WD}b4ESedoDhjOo7rymQ{$8UXu?d
z%_!9Glw!+L2sq-5Ia2wLP6&)=-r6nt8d29kUlb=YGvyuH#b&@Q`ts9;lsO=HQzOLt
zxUpDGf^&aKwnuAGu(Cv2ZfNXpzZmT8QC7sN7V2M>B!vL%6OrIjbNmG;F@!;W^M;ug
zxgg2SPJ{Q8upo}|VHd<%1FK>xH%f-kTSech!_MJ^NY12pDFU||w0Th6sT1JJ?R_ZT
zu4J={8lklLd#b>n*w+h$Ip;{N`p!r1B*^G?dhbt>4%RK8#<ey-&8Bg^xUJxHWO@%J
zD~|S~cllgBVh{1OTIIxGCp?!FVsEbyg=sGpnT15k&0grrirwq;p_GOF(0-gqR#f~0
zrF@lG?lr+Sdty|-&J$a3Ap&y~bIplxHMw5#=RkUfUhuRk9J!LELxc)mYZZpIalMw`
z*dGW{8j=)4W;8bRR%ixLafSa@Nzj$Wt#vfv`l}Pxkxw{LriytkiTmU83<;8x4Dv)p
zx#O1h5JOiSL%g!kDa!gbm=i(!bcUFCr?Tl5pZQ6^!@`W@tK@{2<vQOeQ%b$_OV2R8
zlI5Wht!|+jdEoV!3(676PpY7)Tlim>>Uzr1luM<1XEn<kT9Kp}6$WHv%dI0M7tId0
zlI~c+zgoFLjm2CXI1&-$W@-e*ReWfSf<9nlG)VZz{9(96tc1gyB&GL3=$dWYxU@*k
zBqjXAB<%k(3=*Q39U~>?w~{exe9x(Hn;S=V9g&non7oly<D+fxH7#7FDuRpxohuIU
zlR!MM6aV2Gu`ufo9a;Etaa(`h-6Vf<qw!mX_nWq}TAq2&P_Pd2_Qa^QL4HzrNVy<N
zFCC*V9@EwItOxRYr-94kQq+PH7sz|h`T1rUaKV{^PSON&)%G79zoJaQ01zCxbe8l9
z)=Fe3CZtBg_r_$z3Z)S66Z(o9Ep=na_6k!!yhDl4O(L!s+$mteyF^o{9ZI7$YAr7P
zp;I8pu8(kX%MR9)d6@;vxJGs3cxp${SSkj<htBYUr}8)H|Mu{JXOqLe2N?f*-ESif
zQ5Xj_Dp30kIt$igb_w=xJV~}i%`$ES=Sna?cYe^9_6&c*^T+7i8?mE+av05)s((Zo
zM*NGE??TS*BlUkqjidO))2{ih^B>;A_J=vo?k~d4#i0)@G88}qK6(ZE&zsvOq+q9v
z<=V23w*T4ONfxA+8yze8iLZ~r=4_+w8Q(QFNWt6SN&fZ9t`+f$a~04GbANZGF#|iW
z6HKDs;s>4cz7E{xou=-sYx8Q-hYv1XeCKeRq}nojZ51L!8l;yHrn_mx+YXTFrwM<Y
zY(`0~1In9_l;mO^NeR-vCe3r|?b~Yy3vM`6U?T6dd!LIlE$V}vQzVKM(PPQWl!{LO
z1SFO4VmC*}TUXdGO<H`5(Ed`UoLgL!019K$Wj)&E3nQ|TM><`Aw9ga5h_vH&4r{Bv
zy?F!dhh11;alMq^C;4MT^7uPC;G@F4jJqpY&t$F}!ZbR?r8>Ct)P(Up$Lmki);5eU
z79BxMn5wsdktSF!R<UwYFV3}JRyc;5=yqoZd}YKRje6RxNlt2-l9hOQKl_X2?WxGa
zpbAlrWHWEAxqFkySQHi#1$c$HSV7^Rqne_MQG-)fPVY%Oeux`!vQ*xtUXqW-Ppec7
zAG`Fe5-u3EPOFID>(V)*%F&gmLxI@WF_IPB_@2{u04OSLg!h&3NO4{NLgAhAnZ!#C
z@Wa#ZKf~!9Y~%4uFm2Z=HEgf7+}p#X-ksl(xg^gB5Ua42qHCRK2&y5?eS$9*&IHE%
zB?*2&3o1X?k1ld1U{HsyR&dUR&9Xv$$tc;(s1!*O)BaKAkDtFFh)CgKKq)#z2pfN3
zyqVN7c6^4y^CmmE(FoqBfd>amSFiFN6!d}-l~9_+@lg}QH3gJJ?n>e>NOIl1Yu<wi
zSRhHWFhElPa|=XH7c}T|fjgLx@9}e^BryrfW}En&Th(+_MEGeKneV?!7>6vrC6uJt
zBv<!0-fWZqO^3o96NI-oX%awPQ=y&f!!^UE=T!>c7;=U_DUR&`U>mw$>*~Mg#qxzd
zRooCUGLPQ{5Q~qx7sM;8GFZ7$ht&zE*_r*tD~@YypwYEkI_%eT<~S4m`_&a^8baTT
z^<1)Ucsl4QR6n%cGuJ%SA3!4xwgiCb<;~5i|C*VrUkwba;4?yLisNV{d5>+fv7$(f
zA`%dW_i~h$Fq~IHQ_P}mTDf6~!Z}+mI@s@_qQyxa4bOiLoWt|)pM!FuXbNId0lMZM
zxFl4BSI<X4oaiJNZ}GyNL39$$mp75fGkb*0JxG*FedgtCMTOp>++cKu!jV9tdoPwH
ztk~YrIYzu+7^HNhVisDwbgS(QUil8H_;#{o-{L4y97_r?)FN5-vP}$u<CKC)3wCnR
zwJ;jnxPW%G=FvS|MaHYMH_p$n1Yog<5Isvb-GBLTI{VWE;O-77z(js_Z-(&iOov_L
zXKt8EM1(a7mU&nLP4y3Wu!N<D?QIBH$Po`i7>aKMU9u1R!y${E52Gp0CEv7&mEs*Q
zBSBN3O~W5#H{&m$48?&RFV)Xh9<6qDB)eSZIVt1nCF+4OuvsGO9O_F+iszacpht%6
z#<5obRv!pofick&^NMlq;yB%<>7kQRG?{IZFRT#yYBT<an5;x|c9N^Yg(H*rs90|W
zU1Wx52<h8E2L5ua+mPdE?&265IC=5PYmMx$M-xBB_Vb5!h%uIMsm=HRN!~p|db^k_
zNuG}D{pq@krw9@s2syQJG*F*;o4y1@L>Y3aN+JMbh@f17Rw>qXzx{lYtrzoRJ7h=&
zf93k}sOj3qvees}+nbSH5-%Sbj$kpI#Qimnvt@F$6n);hK_N2wrk7X-THK&{Z_d|6
zCIAyvOi+`$_nIunGXxtTI9f){y2n}?^fs99G+Tpz1IebrL5K-Ihv_csi1-Bn**#3x
zvSa{^QHmgBbhOn~_oSB<XkI3~9e{H}h!HG4kRaKLyoc2x;CSwg@{YSU1q8pBA3sq2
zPk^0V;;t~;S<GYny@Sj64O8nJ^+oxZzc`i_@X9)p@xG5jWkjqtlKcCG;e;4->}Css
z^KpRCwt&L~qTi8!v8Hlhgn<q3DUso{!wHkm%JpbC8UZwhBxpC+YUGQ}=nH6ro|j(B
zmLKl)C^1vQ@5lCWB5<Y6#4BOyDSfo~z~u~%;&wgm2dL;~tj)9NI2<3r(1E;M&x=oI
z3iG=xNM;|15EI=Ilt7Lx$AQf&mT^1(G~qb_cQ~<8lDlu?#2*!z-gs<R(l1mT>kFF!
zkf##|$<fR6dVK#SQfl>AmIzLGEGKg8IUtwM^@6HlTm58SpAyJfoWu#G1Td!`9dERR
zyO<c7>;LUMQ+QRS3XDE!N9p?<vr7Jc>6CDu@6G-UTLx<MI5w6xk819H)JuId6I|8G
z2CrY9KCz9zfZ0gtemdN>Id){9PWhH(7raRq!H1(Lv{Bz|W?9T0PhGX%%_&Kc4^w!3
zQP7Dup}lCUDfV;W6V^HH&30mHWz+(iJm|^67FMwU2l@#XC<ZhlRbYa{Q_J@9Fs{_%
z%ufHELN_Il7LBtrB1B)1cyZmB<ZgD$_~gU>k*g{k*nD+MY?OT5-E>{E*T2#YQ=yR2
z95ofk+A-U)Zau9xYtG+z^r}H%pt{C}_o2RNkcLLKeUKEs*VTm_0^27w>ZMCm(&8Ew
znF}qc(ZHd~BW^TedAW7vVV1Huu_J=excc5r!4{JWCyyX!{0{u8qr`x@?Hh$zmPUPm
z4IW|!r(AbI#f5+emt4Gihg&$|$pK+6zefF&pSa@Tj#epB1)3XhN=TS1ihLmNN+-@)
z{b=JHago#kZ_eRV@-carK`_o_*S$D57v>LO8IHSvwQsIO)o&N;^+asFKaAie7yUrv
zGh!h7W!>7dTTF+Ut~%%Vxa!VrQGwhN+MtM0#(pmrNB!Ax{+E|%II}c`B~+mo`+0+*
z*HQ1+R~4!sEa}V+&DaTMuH-kPO`i?EJ4#!X+V0vjlyKWNBkb1gC6igId~3UGmZwb5
zvXSyJVeET1@yM0`@DKV(qpiU%tMMcEUXqhS@jiMuP;W70?bj!-UgG3rR*m-R?AEX^
z+wex2;wxV~CXvj9Y4e57#RCrXNVE);myFb=_iy-^?pL-#!=-oBhF}>uyKVM$Mu+JF
z_tBBad?~$Yv|6}*k`2J{j2-{4ooU)a7*m|Sc;nyALsz_M{pjMXi?<pSv1IIvL-C_4
ze2x7lNda*3&G?)a6^Z%(G-zbM9f~)41Z*u}=xk$JVI;5WWT)#U2`Dsu91q$h^odlE
z<PrD+d+$59oNrVoAuWe)Tk{PuKoA4^!`29~2nDH`q*z?&>k`(I118&q4S=b%&zw}_
za_(}PSts3QKN*Gq5qGz-d;G<};Zi7>(I!29QWe{9#1fusFTK~p`;mm?;EkTg7OOG%
z1M#2RI@a4Ii!wDwn3HVnR~Zrb6qGVP$mY`3DlR6ClystrVX?of=J1JJL3{_v1BJV%
zhWWaP$ys2NVVhCgs+NiaSWNfAUZ(P=Cg7C+!pyvD3l2|$|A>#z_zSc*_!$XM_okiX
z>L`xy@zn_-OAVZYcl5E>$Ez(EgnDDgWp7)4mhGeq2LGxj8~N<K&}yYFD{!?NE-Ut(
z-c`WIYRL`SoXKEA#t3*deW8`4v|A*r5V?NetqeAC%!ulJvg2Yc|11b(lF$3I&+msC
zp*aL7OS&^66?Q0H$Lp`b@T*8?=>x5B26J2yT%cDRcS$pUD07nH%4+4?*836AYfP2b
zwdQw5a&`{OaI3k8+$X<hz?@J*xiYzstNN2--$@qll;{<l+nrb-jto1x@V>o-Z(cIL
zn7}aj(3P$QQU>*`_A%K47T-W<<glgWzUKlO9HXKr8MQ_YR)l>a@%}BrPpfV$VszK+
z85c`@YSY?J2;b&fB#hISK|q|*lkm7$%eeH1%2VjTA`D2J_X^8CUWFI~G5pZQzBV@M
zAed%-K8h)Wirl3q=h&04ZFt`(b>H|CqNDJ4hF|YVEW4Y!VpUCNKVz+gfWVPx3uMuI
zSNm=)pqwR3ZyLt$PT3M4D6UW=w!F&toL~LoZ{-gRCyf8qu`ll$vp2oWD?w@$kOEos
zxQ;zKq=B24Vlpb=a|!d4)7ZZkE?TAhS{U-B45_W3qh{2<wD|U(B0!9o&zqSa`(*|E
zAkqH!$wuI4vdgP*{GITUV;^EtZ2T<6$-HNfMn60rfNl8?ANi=?cx`v>_(2d3Q^kC>
z{o(g}5jEN7a=*|Mjoto_%cR!YF&=3YcafCd^)qf%@%z<(qfEu|rjjInIvZt~8)f-p
z?tOh7HL|q1#IHn{miwZ0@xA?aQ*3;k^X5z`UP3z+&(iIb$Pb{O*LO{8UyyRM1bpLP
zBv;}HT##f-a{t~#eX_h|!Ib)M&&dpNF{0AcUc&V^(zLp@$S2Up=J^>v^fOsxES78q
z)!)p8;d;{#S*G<$$2IIpyt6a9H+hWv3SGr#?w}>YAJEu<PI-8l$+?%=x&Ea2&hjQ%
zZZ<q<6e>}F>8vx~dW6{auy{p4E>bN;+-H?5kRyKK*tK61BAOzJCqU<XE_45%#pe&a
z3X+UHGQzFy(H4K6yCKSA0u(kgbUr(n!1-ihUmg{cA~-EV4nMMXv809}#lFpc@j3SU
zQU<6pzRB!umK2l0mJm3SA^HIujNbaQ!@Qs0KfII8|AJYWq;{{d=Bo$ew2-GwAg!{g
zUuWNC90g|MQUIR8*KBQgp*>AT8jOgaz<W?aE>v>n4D&~evCt4Xspfj{Q2mgkIQV>f
zV!EVmzJF)%y{V_1d}tK-GHUuz?=su=rscc5VSIYSD-Tw*(Efp*kBhd})B*sWWOe9n
z9=121{re$K+>Kdo%UkwE5&vfv0-JED7&WGVI4*9P_YG&Gj@~Ful~3+W#yGz!xxZvZ
z*h~2N6-9IN(T<r3n~T2Y$UVq)PO@dN#Tu&S6dw7Qq8RSTlgLG%b~7>JJFxI)i|*Sp
zMa&h2wbK2#-`#kPM!*}xSGf!vb9bZ!n2(JQ*9EEVq#}ixNnCyPlU&1!{Bzz)O36js
z%yKG%b6(yWZ=P?~E)CWAY`<Aq$kSHz>X1$G`t&i1EX+G{f-n}4@1f-}S%2vCiX#}Z
zZ^tr?($h+>_7Vx}urww2|G*`{Q8xg#WT5_N=B+?bjdK&lWD(JQc;2_aXL0(ejf;u4
z{iBmHQW7fPxGo-Nox?nvO)Xun<Bs!-YMUu2>-nBU`0Y44lKV5c#@C;JJ`yO<*^Vrk
zrpxSdkB_J}c?SQ-<8M$nz0fIOVl17<EW;h;p3A0GLvdQOJNWIjHn)^4-gg>SaCDI$
z*eF%;?xKkO6a->{tBAX={#^FaG^FC4Zyu7Y(=pJFP8aO63}Fq<+#VUv(0o5)r8Azp
zT4;>?G0`o|QgLi>r};KQCr<M|4oRHv9yGx3VY8;BlKeYgOQuML;?d(@_B<gRYB{Z~
z|7%}cq;2kMqjS~LxWl?_Ab=VY$#i4T2(5m%Vl~@W^Is4{u=~JOxxU`#<w>>Y2Td*S
zgQvRw#?{B<{mzQRb_$z7d3a3^80Un29bR~5!E@X8-v(FLi-ysSY}IDB^x%9WSOy!}
zkjGq?9q&0^FUM9*4vAckK5sswV&ofZeOnHRgGH;2uK(yWV^$9|E%p6Xe)-H^xeM(a
zb({OAp~7S;yZ1AF^UD?@t_^Dq)l6%`ZB5@EpM&@fr42r>2=>$Xd<&f33Kq5ZmUYPv
zxE$2>+omgwG<h<1|MD=|7NGFju7}A#JE8bwO+Oy^{O(owO7(|#>|BU>cF@?N1T|`Z
z6IFEham(>fr6Qzeru{{bPL92Im<IU|>g4Q`KF7~C+mk*Ub#?Wu$(kRN*O4g%pvM%$
z$FVeO>7_RzHpc$)zKwNjOhc8NqzWH!C7175De&C|&zKm{^QZAnrtDuQC%Sl5{AHbH
zMJ&W*^=JJ7^{P6bjdy%#MTOnx{Kwjf*>br5(dA}dB)q8Aeb8v}67Xxp6{B*o^t4>o
zuqfCszHz~sbbc?|Y4f6L;m8^g*PWUIvYwha9sX%R-e&3VzITeRVYcwvWaG%sH!pZo
zG1V(fe+A=fLsT~&3s;vPO(^<x<vkFIo!D>75TBm!IBb{kP3QMFms5179bKLCACtYe
z&v9D}zyGeu;@K)JIvx~b6`RhHC7t@c{N7;wjMY9y(Em3HV}K6ZRPTmRuYUnv5^>zy
zga1xQq94)wLZdFFY_LP9P30t&Pdl?KS731=M-G8rU*bkXKbzjh+>@HBFHPg`?DKUm
z2K8N7<fKXry2av>oa0}fTY8+AB&Plf4%FX^kGr!f4YgLVSb5N2V@Ocm_p-1wJYG&+
zT>U!Loy+c09kLMF(DL!6FtI0+?Z>%-*g=fe+M*?I*{mB^=4^2B_$5!4seF*o?qX!O
zQ1iBQ$4G~ng1&IF{=4e=<DiZSll*osxqr{JcKQ6aj-sc5U7A{R2J1b7jtlkZM9JR^
z`dO$5<p%-3-bXNT4{e%+_ljdRm!Tw7NArqaCQ?xv`Qy!*uT(03A-B+bJj*@VYKZ##
zwQq)@otu>3*FY8>PH1<#NbxlGQk%W=4lWLl6Q=zUdV_?4!pe9Z%VY15cJAu^kA4c~
z-W{wS{i`~3u8fY@vFS{8LdKhc{JxN{16e+k#NFP^K8tSI<!5h47_L6D&q*&b+jpgD
zNjZVZjM_~a{v#J#qO0(Hu`jv2C6^}4Q|n1njp2@LcQ33y7Z(j)m7c$aRhhNRMRb`i
zy{7i-+>$Z)U}ace-41UCM3FKU{THJUyGHBHK5$gCyac$Eir7516n(qt|G<(Jz#v={
zSGp~p-_?N2Gq74AnfEXc$l*HXzaF|+^Zzige+l5HX~?s8J9^eQVQ!Y|D+Ml=-}-1u
zA2gt@PiJ+eW=XKhHW~}x(lwN(6WKP_NjxoY?te$#I^V5gpOZHwVz_Ux+|5(e9lw_6
z@L@af+hSn;!MkyOzqn+(1)PpV_nd|%#YwdSuMUsT7QxZR>;enimbD)XI?vE^yn#GL
z2MhOsu9gSiL4Ss2N-VtV#hYm0pz26mhT~4|=)8qCH9@aFB^-me0Db00v$}MAE#6NB
zgv)@!s^oR&2_LnWO|iN&M27|5X%{w0AFTb}z1TSK{R0>?e9JTU)o#={=nh$<_hj(=
z)8>V><Kje}JWk!Q`1<c*Ev<_|3dy-X=0M{mvEs88hZ|i-S`D$+>UDT_q~p1X{K+j~
zi(nW)e`<63ZCBIww!f<08$R6=pXblt6pbvp3G4Pc>G!N$`~T`PEM)p_r)Ij@<CUZD
zSs33}S-rn)bxB;DN9M6vdM)3eYn#|k*ZW^`&G@HM&lWNO?XgsvQ<H4YuTOlBviesH
z(uUD=dcDUD#GCPj&(!R0c(!ZOY>8(RVL<cSqZu|i)yk%1j)ppm(fKoS>UWT*9$S6B
z^&)%j4XuJ~eG6Q@O-`jZP0WsxGHmo+i;RqD^uN0N_y7J3@mcs^cET>>x(bhv@~Tk&
z{`h_*-^h+Nr3OXDK%RxX>n1GK{5e09B&QlA(WKQ<>=E%-*S(bv7Z*hV^PfkpDCGwm
zo{F4IU|T(zw(1qDOfSA^ch{d0p(yyKl~s7YK2$<=?2zXqGU7mwk}8?HDow2SF|Bvm
z3RxmM?B|U?2rj;P4~{9!0hVNsx)_W~vF@OaHs+-6x26JX8(6!R0yk53aWr-0T<USh
zEq*78k1+i#l9JNyPaKmXzgZ>1crb5!5{GZCMTN<TE&leCUWUaP0Jj&5Q>3_lnXHl@
z1g~7zHqEh=mDAiaZ@V3RJ<__s+2!Vw^8Qpx$vB&6enyRHG8?M)Rf5<1RJX{lPv5O{
z>N~C_Z2xXj$V++pPG%@nNi@>lU&CkpJkREw&V0O{TxfHK0Fy!xBQcQdIp&#l^>psC
zm@>*8K_Ke((?|nY|HrKTje8LB;LQA8mlzE5rLb|JJzbe*&A$Z2e(zpigXnm_A4X?Z
z<%3Ggld^W(f@13Ao)(FpF8bq)0GQdb>3^9wm`T+hcR+i}JF?PlMY~_n{mFp=$(2V1
zaxHpFIG0^gdU#PP_tmhXwkJ{WT+gwozSH~h6dlJkXc;Yfu!C)YB$#Kf5D~O}81hJc
zA!Np@IkZW4mEQMVa#m`x=CK;Vg#VKB=v86(+5X`;k+r!4=VoN;0{SHf%phHV-0>dU
z*Ijh-t+At6ntMS$@~fwaytwqA8!2c}B4(yP!y~xOXF~|f^bh@7vB+$l2kTCuh{D9T
zf15#g5ZP7LveO68^fHm{&Xv{|`x2e(q+-mdeDj=v^G>f{S_~}d?4v<mdV4bO9O<{+
zX*JfU9qn3@Uu%_j%fKg|DtZ=nP&@0bHLG|ky#KVctVlQm?GhW>o|PY!nI~pQ{rfeT
zbXe5mX08_Q08s(WmkUoiTlF@Hk9oM=K>`Y{NKWY$W8OSZDLIzBdd_e>YMd4Z#J<Xo
zp=W%1O`WA9)X=S3(B*Tj<^3CR+S~L8$IzUd)gi01Fae*ZoW39-q+wFpt7sGM@~g4X
z?Rs6F7?L{}OX)L`^nF64vZ_IGhfmDm4zD>+T23Vaf2GI$$Fy7|Q+@3HP?;Vh6ew2W
z2rDmSjeWytPUsVP@fTDtC)RPP-*~?eNGzCI+~gTB?Fzw&gq{Vq=^C|;Z3uG$aV3<H
zjg=Tm<6t2pBRMtB#jQWNeI;<v!cTpX-7c1GGh`3J)w*p5lUpG1wg;{LZ{e9trqitL
zjm6(TTuad_@t6^iRET%|7yML}0<4Y0*c~s`vc`H>d_-cR$q=#r?uPaETQ=p9etqNv
zz|H1-*Ohu_5C+r@7*^IHeAyDcHpqNZY5RF}+XCxqowuVRpm35-A;o7LD|BB#9v>aa
zQ{o&`aHg51|EFoh#^A+q1d#m?FHmGO$AzuYkh$|;mP!oTG9I21>t+7qHMy%2VO$(u
zopAJeZD8;1zh7C6Baaj~5ZE`0%^<q9+%P1yn|IBwlq>XE?a&<MC+v;v4J99Y&Ehne
zwT?e=DMZ4$c=@Grhy>Yyf2`oiXx9?xA5K)9LR9fbe8na<Zoi!uEM$ofqR5u6uN%!!
z1A_)8|LR%g=RTK#_@$5xcxtmji+YpcP304hft;0nY*F{KQwjJKQC-UuaP$d|{uaf*
zpWhv}J&#a4rX${R7+iVaX+bxvE<|JglChcHhZ#i|gi$(24^l5=HuRB#5Pl|7_`gs^
zcK!qVP#{(av_1)R);_28hLR(JoOl*L6i)?w(cL?L2wI_U4;}bYJ&D1a>5>jEic)u$
zx{!{tAvRtzmm=J`Bj21`Kjt%1!58~aNY6MoNa9a8-Vuf=DpsLex=;9?bH0s=GrH$6
zOm3k2UtcXoN3J*4(^l73nV-tvJoPy)Ch8JyBbWQv*9N;V(by`H5-=m??YkSB^xrX=
zn?h?}b#rQKXOOgpiop^;g^=~~dR7q;G$K9;&Z>AEt0IBE2x=BpZnVg6Ai54%8N*&O
z%UwDU-%;0jsbvnsFe1h?M(b12_glBk@XCG7w(+tus8c@j6+4PgAv=8xco77=RVNRR
z&Bh6`^jtERo|_e4uC-oA%hvgZ3&+mKvh_N<*!3UaDn^6AC3g8^Uwo1(pL2^H_Erp?
zfJSEQvAa4HtpMnEb}Pn9r*RIvsAq6rUBb`B-(E;Coux&6sXmE$5EYq!x@nVcYobjY
z)Qmz39${oc-{Q?O<agTbemM2OD*C;>0^p2Fe6YhMZd6G5bLj)q((=2{fD@q!LI#%8
zeTbvVQhJI3h#xMY0=Q5{6{A)@2%rBx@7IMQdiPctI4iPC|3kUkmt40oaev42(}vy)
z*IvR4C8SGjOMqb_l&rzAZhK|>Ccm;U9q@PSef@Q!*6^|uN9hrK3(_lW@-CTnDCl;;
zt^CAy;P;Mv_pit{gkA6VhTt*fxU#NWg~xX<a1u%HVCpm=m)yb`a(R(Qy1HjeLwgiW
zB}cY?E8vb9Gepo@PHY2UcVaPc<M3UFqE>Hrses_OmX<j>4^hGTyPQFDGkK)L)>>H-
z-E}%Kii`7zwGw!bZJH*}Nz4o9ft~!De+{WoEYCDVlbmz8qznS12vkN@Mj$2EJDJQY
zyT!e$%)2LWf~W*w^bWLShS}V1N*eT~bg)_RM>v?1&W(9Dr~lH}Cc8-NP07i++#SyO
zhy>j2Ko*)O-d{r&4&l>h=(@fi9za>I7<nD<<|Vcvp{_$1tK%#86-{X26_PyOpt^sB
zu7;@c7Cr6P!jCJ8XcG!)5$ZYV>y;UCIJ!?Jf8%W<o?=FQl*ly61IE;O3Wxlv_+Xl=
zoh4dl-pmRjRk`Tz9IU^vA=RtWXcktzVJVfpt?i~or{sM5XJep+wueL_%>UeaFd~am
z-gL6f`FQLl*ljz&ZcDyADH1+i0|>SD!+V+r(;_jyrrk47CQ_~E>h1PYI%h86GP*Nq
zC5RpWzob0F?Pfc)cjA;G>9JlZQ6l2Y-b+s4`%tbk#wypL69MVPe!I!#`mmjcQnI$K
zzn}_-rwg?W|H=9C_5%~OqdpZM?6k-2;HepOjFjoz6C8c*$^lOL{GH7c#<%!<10=uQ
z8vvaa>TLadR-6%=S8`0c18k$khzWp%4bLq;zaVvjI4a?t?T}*<bAT4G#d*<ds(*a~
zM{5Qu7IY?lU#6vl&@&Y1t^JI@zDrt@$Ktz}*!Cl>hUmZrPzmTycJO;8@e$Tufiu+9
zo-4oQTw~ZVqoegwlY#kt{4A<5GEbUubL=>$aL{_Ldy-FUXXmU7EFLQ)OPRm;U#aCD
z*J1gaGdqW!8x;1hDp_1pk7EzPOB!YLhl3~f;d&kcD%CHysHp2jeC}RBrW(Uhnr>Ih
z5_JDm^`F6U2Tl7wc(3HA0$#nNo*sRgW<b)t5D+?ynLol|i+f(Usnn9d(rAtU_mL=e
zYJUZNR}dRe=pohgCnc{)8^iMF-@5KxYuH}C6uqSRERbP)gtS^O7-&)u{TyJQ^NhrL
ztc@2rl+YI$+z@};>T&5U@q~lq`oXJgXdea8d*DqUXVfme5=m$|cx2ZYvVx2}E<ThM
zx6y~<X3#~pUff5K{dfeA7x$oCOOa7DsG-=~htq|EMAJeunqddVt6>_H=tUexSGKwO
zY_~A-O&oq`v)Opk5S$Z?S3G<Bqx=vd^AY@}V8@dqO7|VdwA(WcqNL_YRAj~J?)m=-
zMW4nzKAk)i6>eS;ltNCG8-L*-1%gt)WP127G+Dv7l&=HW9vl6Nj;L+l5NP8>;{U>v
zBYiH}@b6C$--Lsp`TXjMz!>aBaT2X$1pzo^8z_FiqYSic?_c0oY?;1Xy`$8Zj`YU3
zs+Nq$d^pkDZ?3<t8^S!Gj?Fmzui92zel))yxa==8<K$eb`>#;_Nkt&Q6O-{nva0U@
z*Q2M|jb~xhDR29>*YG5pZPT0(7=XDY+>+;3z+?+^c2gkH<#3sDtFgpj^``oDUr?=O
z4l%(J%#INkC{Kod#Qo*>uh#POAGs`LIslz_PUPrZ)QS@`l9n5mIkTEgfewUjBZ|d+
z%}VaXT$C7_uJ2S4Nm3Dm4leH+E6Z{rgDT=jDp2Xd$VK^D6jbJ(eaWqcpvg0Eb>1nD
z@D}{>F+JMhq_wa@QG=<RR^i2cZz>H1-{y~eMD*;;VN&$$DDza}tdFIdV8Y;Jdy}IJ
z>}v-)k_>mNi97|&JQ~RgDbJL7@@I_(Bf*&%l^7VN;Zrf?Z@IsrY{Yc_oS%xMACzEs
zq_Tb^!!6bRZJOAP1L2OTBH{1ZvKr)3l*|9sI1q!A^np7c?IoPNu8e%>^*H{PYsP+t
zT!yU%s?9@`m*?HGroO;hq3fn64gla+kuM!y>vQ(vb%-htz7_4!G7yFbQbvnRW>t%N
zfoS$>Y<O}(UCOH;y9AL4?7D5<9cbxl(GDe98=ja@A;6I5JImvckcyYuE6v~XgoNek
zp=s`WE>_YfKwl>XmO)Uozjf{M{o6ZJI@A(KHHtSKzj@7zk&2rDbQ)`lUC_Tg%Tqi@
z^htte);9PkV!w`&&P98g^DfqO<-%4)IOYp!``A8@17MHt^IwG1U(Sny&~Rr&Kss0y
z7S(x`zj>E2RX+BlS%Q{oY%5Jd0od@jIBQa#9rhRJ+%4aA=a!b=1H**v&2{hv?7PGu
zZ3X>_m?TC|yaxG7Z!;eER<ds1^4<CpnDE|(INwXiQZ;Dv#fanH{k)dAE4p+&;AWrr
zezJ7h%w-_i(Dy#^J4)#3(%yIMo~vi^vJ@wk6o(IAtlN#3=T8z^Hm6#QUhhu6l)~w1
zF<Ww#3OEb#g~+Ry#R0X@FmUjCbO5!O=2vdUTVwI<%s)>5+%HVvm$>NLIF;N0g-WN-
z%X))>1ypGXOo`@QEFwQHnUya<PZbo0B)J)tpsh7DtvpcRYJ#k^nMYe-!2{>w_L<tp
z&&?Z|`NP3aSpNxYO4U#q0p=JfX}Ldk%0y`1WG^!?ZL6q$(j<BMNl<e>Gn~q}&Ho8f
zi?LFe2;<^nO+l2UC0SK{6xU$14v^qf#bo8>qzQgwYV6r-<Er7%#>A<^`X?SJRVyf;
z-t_(ZXfEb1_Zi7dhxw=eTYayLlJ@t>>yPhevql=EmnGPBFmWtV3gyI~T$+32%|13c
z2r|WU3t_CEo`<Ykp`189-<#<E&dKKbDqMU3t-gG95*~RU|BP*yB_>(g2NcifQMZK}
z7aFI{X3Jp_9NK>t!8P7{AeHVebLvk1LflPZ*CrHO;@_JOTLK89kJzy|)|LTdq@hE=
z>;RT&5_}C7n3ehS8U6oPgsJNe@A_bcm?p4rjd!?ahhU|9`rdqsro?Y4`&?1F6|@@~
zngYNP7t~d(EZj{YF|Zfb9TY&ZEzDmpSZ8cz;}ltDd(Q9zC5mT%C%E(72~V_49ijhy
z0~%@(SVD0=rQ9r=H{QI|A_#8)ztd|knLDnt8krHAQ5s-GeRkwmco;69d=wED47^u4
z&yI(zciUN{TK~Kn-XTQ$E$J5fZ#<2|A`RO{ZC}JN%QnlWKK<E;H>seFIg5m<>}sl(
zS-)fC1=@o*(OL|(&`;T?T)!5~0$`;3;~|zjxT8CA&6H2^+sb(#qAqJ6ub*m?Z}kXU
z<nF_JGALM*OdFmvTOO!t<lnN`U*zmj!pqDPfjU|gtiM5wg#7=J0|gWa%qAQ@^A^NC
zhclf+Qtiyg$!z0%VIn>EW9!#>X7@D5RRacKwO%3_m@r#DzUgPO`>L3ATsYgGka=Uu
z;zUn)u1&uW$Ir?JMh>u2XqPF$m?UmKOx|`W>J%+G`)2oXL&(!!(wUIa9ZE)MUfy)Z
zPu!17j(1~X)raDyq>DhD;aFejvsVq@xN<2-w`OngFye}`)!*Y7t5s8c49D85JfcBG
zgo)7XZidpi^<^FJv5Fz(z_ra2>s3#L$|K|{1DN&}H0&%BH0$4r+X+cfiFKbddJ>~1
zBh*6h7>c4pLi;<G>p7<(D*!t(MZB*Qt{SEPsL4hm3pU(;<Uu=aY$8eSzoRVb?`Y0P
zeeFmal~4{zt+4in&3h?*XWax7(}BEkf_oK08GKCL{q}l;F_4@R8WP&=S@BveRRa`r
zwLTKv$Zi{VAlV}R%LjprmPo}n>bqfoOvnt!6ZaBI1kcZ46(^kl$fuJNO#3L+u#y-W
zAM6ZRK295r8uV7XDB_Bf^fc(L?3nZScQGPmG%>UjInri=0czoIInG~vPtwAybV+aC
z2afKSB?vw4RSC{>|D88R_?z7_d$5`cDj-`iU$$l!v`S|i+xji`ONx3AtRryJFJ_h;
z2;F%CH0<wM>lOYY0B=O}JP3h>dP!E|K`~S~^4K;W>P6X(8Bq5X_E)1)2%d<>uEspo
zbaQO!iuqh=#mf((Ix`#A@=C}5^pga?MZfo=RNI+;PS<&jAZ#uX6E-SZa0?7Li?fc{
z4kCOG7eivVfZh9`V^|f*0Ao_vp|$+>2^2(P>c6c{*3}TSH6(tO1B0;@XJ2Dj3_UTL
z8tkjoY&izB#iNXNUpwCcn1>UJ=Iw`kUfEZY%i+$GFe4sd+(>F4PfrK%O(O*^z=HIz
z`nm)J7he{42f_*M@6OuaqQbnk?5ESqun$~Q%K@0D5=x8!f*mzY4N)H(@}q7QXK@c;
zl<*P@W^&J~2J%<3)g@RklV{hoRWsfQTH8q<2hZNv3th{B+sf3a8Ylr$Hx)N&@#}(_
z;?^`LdjFk7+yjgTS>83X+p;hBt*zE^LM$hvA6_;pgkV57Rl`u$MmDA!ItJ1%KiP&4
zFCjJ1eg?`?^O$!V28XgWL((w`oMG&<jnyYRV7@+}@_5**AXh$=j||#qM5+hTw$dzT
z(3RyTFB-O<3#^5*Y$l4dqsQ<(^f64TMEv_a5+DL736eT$_V;Rb(ad7JA8`gXU*5Ce
zz{=(?z(jqL?du5?IrFv?wYl5asgL=8R2cgh)0S5+sN)j2x63uSL!tYwkJO<JIAJgJ
zP|-YVv`*S2_uC@#J@NVp>v-#KKj>p;2T>(){Xc8X<Cw4`>QD}!zM(B?Z>l-tjLCX}
zqD%KS+|je5^Rn49n@Kll%sUd_d}j;Mu+zxu>c0!Z7tMYDz=Pey|LR=jU&$URY5=IC
zqA%K0DS7NvGyZ7N+c{qS2L@WNf*PR7T}O!mlyQJ%`qEq_T!Rt%Uh<VL*=L$j{>Lc&
zMZA)UCmYv_FYp*Udyt<t6n3VhFKfB$3B14y<r<I3BY6+B(rRaBP3MKmlhw~f?e)*$
z4sjQ}sPL`LKUO;uI=qWZi9KnUC|S}TX#eJCj+H$MGp>teYG1cu3(~bHnX1UK#jDa4
zqdl`I%j|zbB9c45B72}!z25|Do4?}5FnExV40YYc-67Z?8EMt~Uh3)%IgCMhL1N9m
zJUdqe5tI&)PYpIF$Lt1LeakWb1JFP(zgt&gsPjz2*YIcf7G~Uz#<pZ|Q06)PZYaEd
zWA6`2fPW8X!zwXXoD(`|`?Z>+sF|k^1O3D>4Oid>T+%Mo9=Ba0l`&$Y1VT&NHvE%L
z`k!&TXRT#D)*zUIvLfpq|AUf0H{iG%_>s8DGpW4MWexZ1yq|+JPWw;l{zEHdOE%cB
z3ka+$G1N^zH{rPMDRs*RA~8kcdN^No$<J@#mUfx;xrI+CA2B4iD#+g_#=kT0m5?{;
zmfj}<q{nT;2=`z1PUy7OcHfZgU+BP}@PBd>S4{?ODJRqE7d<Hfcv;Rz&dsj2{~zgA
zD@to^0b{Z~zEX3KU0v!%{{}J&YH?L%*X@9u%kT$r?~s_b>9kKGSc)zxzZ4J{$$a`2
z&dT#`?bz*-Xdrm1sC)<hakvG4k?qna>F;ONG~J@ENR{s$1s2Sw@3gmmOtxD?15`@x
zCJQbtwiJx$SnkjFdcR+HZ89{Fq?I-`l$PE^OV_`X^zT*ufmQm(+5gkl<c1FXCATzt
zT5=b6c}B)I$e^aQaxMJZ6Ywp350?}<DeM@N))4%OqOyy@h#l0@^;?SEc=x5{L{DCZ
z$(r!rM()FH_!iD7Z`6?L^fM<Cna>+~zaEYk@c&1+3KMQpyPdr4C_4C*%eJ(Bz`=h~
z4o=&R>@(F9%O@=>sMHSH^D6vV(!0-bUgEATa`s5Y=ad8e{{Vl{-u0lU8g5r!&?LR`
z>}qP8C<p7~vLcr=&c|%IKW$Bs2tU0n{>AwBPw*X_Q|>vJ-S(SGkmNNlaRHVM{*$sk
z&x-zST#?!FZ8vdiOB1iFL3#)N{aU_9tgvI%ZRkk_QPTzWI`n@*OYhIQ&C^L1|B3AQ
zUT8{wR0I5heuDfQllF|9x@fvV4ml@(%E9`$20y?RZ8Ep|!?x5wE;}=JQ}Vk*KUd%{
z@PpuYk|W->gA$o?V@h-cgODDWf(vk7n_R6!z^*<K1C+G%Ie6p%BT5|4xVUdCJF-1>
zi@7G@Bux6H=V#y|9G8=o*Wf6tL^WZMSE4kKYDWN1%I%3|_*koW-3|NBYvJ)(#R-XE
zOs$PWE^J_<Q)2OL(Jj3{@8KU%?(#GobD{Gzs{<nqqzj4eQ?~TH4`$$!esw_RPcB>P
z+>#MGs&2^ffZ$;*&F&YTBDeU!CI$tAbO6qFEp0E!y|%KCvYSs!c8^3siPlqoY525q
zSEKTGqU>6%)bZeuTFFPk`{AEwWVq&C^=Bjd@d)~q=$kh9hqMHHT5h|!KbGidikV6<
z0@J#w4*gFmLN?-tCAOX1%Zn3BFs?-Bn1}zQ*6=3T9R;zF-xTwmQINLJ=;ydX)O|XC
z-c|DA4wBQ-_$gZ&9_Z(ye8(P1@5otisbolx7F<sn|BtBfVn*LL7HGRIQbC{g!=(CH
zgQ3kJoP-O?I}{w{`7+9e211VSgnwFz-w~~&?Q?Cmn=TWQz$jc)o{dQSa7G}-ZrplP
zN~0kIWI(O>Q?@gBz>!pM)_S$5K~7R&ScM|Tz5JPm3vf=GxC`#ijei<qsuE1WapRv+
z^(BvMcWGhA%7V5oe?(OdCk%i6$}dd22&Klp#M|aWQ$xst4757APTl9U3sIWlwLdjs
zKLq2atQWipV-i543Q>~X5kPk8c1d++{lO?2ff;4}N^Z1%uj1|nXOIN=GYITge!^^b
zuX`h>&4WA&409#~rSe)}TXc&x%y}*7Pf31hQA@(ZZFK@GQ!wN9zMBPYG4!T00(uk@
z9SZCfT3aXOd*$7>m7-jprgP%@r0~X}l+)7e5wU(XE4@82r$T;Ez<nkF2UWD0xWzQ~
z6U{6S_3fme38}8wO=oMmn{PQWT+l>~v9Pl@5Z{Pf>(+7}9B$}%Q+DjQq-{q4%}D$B
z$YtdIzp0@R4j5BGLWhCJcf`H@A9DqQJRE>&F>E1q0(rnK3~4G<R@MIo0T=}636KAN
z6`h*x%8pU94L1s}FWU;gZ2*nQz;5VkMg8ef-&sy`Ny8w4Tu>+{;DjQNfq&~7e`m3%
zrou_R`hE%34Npn*^*B~-yNOYoY9&Z@-U$<6y>LvGcSFvDO&r^pNtuUX$?!=i6%-#I
zmgC)YgWF2kg+*=2&?%|PjsiF)`c2(-kkr}nW`6f5oHT6o7h1cgly@viV6}yUhCIl&
zY}jELT)CiAj=8X}`93DXd>kXEWau{i_p0u+FDvRB(B57Y;s`6s?rE+Tl^`v;tVPpZ
zdfN@o&dCn$RN3dL47eXloapz`-{>g_#sNcM-YtDjo#B+r&X{jH*_^7@K7d>)x<RWw
z(jl<`$+?*qE#1U`&7w%y>T<ZuVfW@Q&WjIr%z!OD?-wws8%mE<N93d@B8I#htXnB5
z<z$hMwi_6l%4avJW@@+}@_EX=DmN=;Z{O2UTc%hle1|IIjzArBal@Rm`b>H^-8jlP
zJ6N|~)Krbs4Oz{L^{k7f>DJKotJfl*r#j%ISw&|iYx^4VQ;}f5E#2->j=tcs=8;pq
zi0l$Yp$1f@mV<h)64X{TV8mlO%ziYdUW-6Kg#y(+CD#hh-mhu)nF+I4*D>e@F=Up`
z8~kK<)<o_^LCEz9<q0%<RJ)#c1Z&wZvP>Px2@49KSfW0XZCtSUXWQ{&-dT0M%I5dL
zQr>N#w+M^rZ7;f8PAU({(T$;1kUSW624p00khsWKBn>xcla%so^}39aS>2ITU)_a}
zBE=zwWo?oZG4?3G)t1CJ8Qyjg5cFS*k~EEyn<^7KqZ3C$7Q|>uWCRox3e@fXi%#s}
zL}<%VL0gJF6r^;1%rF-@AsItVy6r{V8S1W2EkTxU-cgJZkj33BdTcqtPNF7pylyCN
z3uv53w-{zDHcIS2ciCGFM?i_o0hl{fP?2VeTknyR&`lk==CbFqka|ba##!RD+-KYU
zMAJ!MyN^z2D%EY|)iG<>HY27X0Uo(QepqzKl8s*J7}~uB)^vX-Af<!<oQw9fvIp@Y
zq?X%%(UJ3sQm!Wb%?PqfHCSx;gvH}is9UnX=?a6=tozWG^fCtOkU-+X6Oq#rkt2GO
zhpBRUd81dmzq*^uY)PwTc8s?1V_t?@2Y%}t^_1Uo+OOqS07Wtd3y^w?!BoFR9zx{8
zbbU^Z#diMfG6CPZZHA*=l^4e7%7VaXrmo}E%r!f3dj|=}GGhB6mpW1A)orRXM{fI8
z0IH~ja)GRzXzs8$BDZqDWMJg@FNRj?<bio@#`CzlXk-oY9K?y+$yOEdr|#!sNtBkm
zypo)QjzTKwV)9wlX%2-MAdyoeotlm$o31fK1!pnFZVWwc!+n~%?6|ocWNjdmLJj9!
zccg|{Lj?!0kw7~r08@yughA68_uKew%Y3nSy!5C!dF~_p+tWnKZs)7X%0zVAa;t|F
zp>jjT%VN%!AZ%xuE_H!DvkBVh_iB>6kx*w8Z<5FdB|3h!u}HKdTy92);mioz$2GPE
zm6tVEz-4*3MEaoRKyD#HvE`N<Aw_{(VG`+A?3(J(Fvlf!)jp|9`b3U;nl8p;aBWAs
z((K0eMAe7eq~Z?0{rCu1Gpn&<@oL*CIa3cUXxXo(SsBoB-|_KuvVL&~Pa7)iX*(H|
zxGI{q2*eyZW0mbmC*iho4#K93e0AA`+rMQOR&~a<)vUB+Bt(v%w}U!nCO6$_+s-6O
z9OjW97JGCyHSC}+^pHKO*8$!tYeU<Zw<-nN)=!uRk8a01_Z_}!DacjVk#h0_yzPsw
zpy?8GZNt9}+o(A!rON{}1FG)srR7CZD=Qd0s9q=;FSW^@)55NalOM*2Io(L?h;^Hi
zy2AB|6K8gdMJHYDj<?}4<FF}?jNEUv+`9XU?h6B<q0KdxE(2Nt{a&HAbZ473*}ftM
zDyw?0<}BFQ*|8Rtt*R2KZZK`AYSwC{{AHNNXsR!3Ii$cFDk$3!aknK5jg4}W!In+g
z)c)KWI!xJxuH!6Xj8D)HmG#MbGix}sl|F8Z81f=7``t*gy`#Xkgv^=~oo1FcogI%0
z1Z^oWy1fK`LmP)IUC~r65l$pqC15%19NyC9)m`Ya%Y14(rmbtBdDV!{M$N>I`xD1B
zIBU47O472?&!!uPWo%&4^MXjySk>6RVbZ6ORX{D5cgkl?m#S*`Du&4o+n}9Q#uiSg
zOFUX~xMBlp$|{Lkhd$@WCc0<y=^`=DmMRJB#<VpT=S$^;^~iTJT5P%Dv0JDnbxBLy
zyYdz-hsabv5%^`r)6c4wwe~yJWrl&#ui;`-OUvt3O=4YFPOatKw^&zTPdEM4+?f5U
z+Z@z(A4~-Kmj1Uaere&#YQsb`D<HSm+F2K|ssZ-vCV%X1Q%CuAXTW(9F1zn_IB8Qt
zx8zEo>W-GZLs)lPXc}aTO-$pqvVt9F4>cu%I{a(IT+`o9Wl0y^&zW=A76^6Rn4V8C
z>M)-(%^f-Qi8SykENVgL_R{1P7fFh<9*qfuh8Si(lu=eRigW?$iU;;Mo$AvvZB-x#
zg53&-+YmL^yG$#gv99|J|8h%vyO+a%+R_s1vT$gL@3xCOqu;G+>GKj)Y9h#{%iK3a
zdWlO9GzCRj0=tan41LfAm?PGfW!-jWVbgKKDr^J>*mO&%2b>rT7GysAa$6e!7GVV{
z6%AtlsNVfpQa$H1p*f{hSA)A5KwXDLSamYoDB1UM7=qTdq<tmO&xWQcpOmb&$V)3L
z400;dDw_hYYL(iO@E2v3R?gKxn)UDrKKq@u;l~>CeQ@SV?ChS%84y8$Gz0t#@Bx-!
z$b|r6SA%Ng9BH!1T%aGbW^7iOF;A~fYr(6!y4pY3&fO*~!#sR~br%InoyKknsZBB8
z2Q5kO(9b8Bhh<ffG@a3xIu=S)CGg3Xyl;v4mR$j?H)SIy607Q<`D`_eZH+n#^lZgH
z7EUwHt7&Fz%^48wpnyN?`~%MALDIqqA(9AM*Ib~jist03D-UcMuc2tS7vM(!i-G<d
zP8j+KtB*+Mvj#v}qs>+WnOC%!s^%1c9RX*U*gvj&{$1ALLkU_#mZVvqwd^vzmhe{z
z7z!N`voP)qKqa2z?Q3X>dlq08=KRv|kJ>}LrCPb7i*4e(qZGDaRzq1Db@-?WDXUpC
zt0hd<9b$YVN2il=9Tp@2w<NJGDB<nsNYeYQAF5J@MVJ#BtFC6k)Taf|>KaY-+LoRN
zcs|0a+r4Mu2(#()0`?ue@JqKBHRt8CfVTi{&e^o4{Y+!8BZc-E-f8K)@oiy;Ltq+z
zPD{_5;rp`=o<v%H%f)$O*?vpfyz#*=J)eVju;T34nv<bX)^Pr$%_&PpKl9m5B4$BZ
zC1L7(npG`rUxY?WzxM(@yQz~_dh2U|n~qpN+BB05&Enta*gV@cbPcC~gCwj7*>lzi
zx2c6>AEXAovDe6&buHM?T$=@GG(~)$ovey=fBkMCN=rBW2A+2<ZU3N+?Tu{db(A$`
zn$rFs;Dd+%1H6Ja5=#|T8suC)#0-GX8p^Z~LWoQ74nDw!)3|ZApr9ecc^T$(*&AK_
zyDB@j%ctxzAXlWo*1PCuUL@(?Nt_?kR=jV{__wZIo-cI;rs%7#{>ZHMpuJ{2{Va<f
z7i5-N9@)aF1u38{@!LC-KY{+<Nu)Kh+Se3d!z~b;)sp#EQ*+AZU{k`^ssU=wv#Kcs
zv<UBE!T5JYbG{a&Q-j#9LxY6>Mx;~GM4*RmqD0Hgv|cSK=*{X;Zwdt6^IB_5Uk8C-
z7u@|SC$Va1Wc66&XhGk9Y6Kk2+yONpMlK%Ta#6ux5PxzLF+Nxh1J$xvF5QYz4f|C&
zo3rp6{A%zw)gUmSY+ON^Ml)r?uaw~}-09j{Srx$js)REFj+{f$)b6%t@SE1&b_|7<
z>PttmJ%lC<><ZW6t@eJuGW_Q?gmT!mp^{B52iAL~rT5$8NNTPYqF;)lB{FKb1VT_C
zPvIB1ZxfvYl1r-P=$9X08BB9U8n6a0;b-^}UIdS=o-J%DIEkGIz_KC_@3nYvdmNR)
zKcXUR!+gu-yIay;kKq?Bu56jTR?@tX9#=<Dbt(<@u)fjS>TUWds(ihq@5fp5w<+86
z72MI*zIC$*^D0all+~BZs{XdLUy$B+;epY=r3-r1!kM@<b>!%`0n1uD{F|-)wc4;@
zv7l+3wv*dy5<L$@KdZ*SR?9S?^9Q2HyRw3o#>zey{^yK;Ly%`clVS?)@*~%WQPb|v
zJDz{*`cz3MNSxhh27RESk6PUN*wcUFQt?Gs{?K+Jur2rP3H$<gjei>|?jCRvl$<kq
z5}AH0n#g7Hr-T2TglC_lKg)PnO0otETD<$f6i{6aM#Ex(lD^p{;eV?X(Q0<-^9w&q
zqpCY7xoIEf5t;N^hS%^b{MFB6sw?tRk{B<#c>0#cxopCMHuv83^T^uHL+xsTX*mld
z*lv%U^s98ipSbk8FXX5xZo4Y!AK>3cU@%rf(AG+(Q)kTw&o@Y)ze;*+!(Wrh)u)oo
zRJ2u5kg_W2^E3R?HOs})5JmU#mRsY{)KKl$a2Ngt56l8@DQ;Ml%V5b;i$2%2bovhb
z*rk2i@;PnZgfBj-i3Y>>U&1fKf7QbusifDumNXm8t;kpU8U7PIu)XC?XG4@+3^>*8
ztSiPh2fxFQ@O#(Nbt_rVyXbw>`GE~LPcGp9Wf%REh2Jc&n(((N>2nu;+?9TO|5H1g
zxHHouXy1n>{XG2_T<4<~fNhD+IceW+`ZvSZ`X_tjc(lboK?H>R`w^aa1r*Mo6aD9%
z(GmX7rUsfmfM2`t-*#!zl6tYMRL*LILZs&@`~*M2%U$8mO8|FeGLY^25`No>|BCA~
zvtnLcfnZaP(j&>g*)I6oPH!g8^s>m;mbCBB@H0I3^iz=<uwvLkHkMUWpy37l-nI6*
zuG$Z4{3$pcW=?zq7T^K=6@Gy?p8oCQgKmU&BX|J+D-A;rw*SwXhPZ}%T1x6PoV`+&
zY_DkV`ZMkQzS}j7s4N=Jx$4Id=v~#|gtwA@zwKOFUR6@XoG+99HR;Ydcp~gS>zc;c
zl9p*?OBXF)wXTV*zwNrWeA6XiEb`NIOPfE!E4Ztr-*37mq1D{SEnihtczR9K5TD3)
ze7tj-q;FcbWUXcUH^lvSBmn0ed3CZMHD~qsn8Bt*?;UC1*)I5fw^GaDzoKN-iq;n2
z5%sQg;oo{S$;LpN@_awSbNChh0q%AURNZorTOPV4%KNOC*8|bdn=bmPWw+d$+^x&>
zSk&J4U$yi*ga5XIk1gk|SKZ$75V?7*z1<lFirYAD(ayS%YLSz-a36kx-}l15(ZA2P
zSCnLVuBG3<?mA6!Qz-Q5`?|PrQRBpalli#Nh5w3Eqo#W6@;X0ilxL9sZ+G%<#d-QT
ztJ#u<?<9SH^6o<y3)V&dKGR&2_72+PucH5M_)P>^No&I;?cTtB_#6DvWxaU#&D^bv
zkSa<VyoYD-tN3@xzuy%6KHuBarB)QSdja?1H+bANjLzhbd9AUcSQSa3Ik*r1uv2@Q
z{ITNDwrWhuq&!gNzz28=kKom=^QbrBsoSqU<9b8eQlHP^vGxwPyWanx+{236vQszG
zEyUSgYb5&C&hLMcpA>Y(2@Tb0ih<w53wR7KcU?<ThL_68zIKg`iOlx}c<J_bXW(zk
zq6wDH1r3x8M_5zb`3XFMx4X{6Uxr6o5Bnab1iXoXC3ppo;l-{a$*b@hnwlPeRyonc
z#hstQQ|*mzb-n*ye)>m^(;id^z-*U?@Ce@SI?MJG+=ES+g9~sRhGj}@DN*^xMO$~l
z{|n*&oyPf&xV`ru;2AuCC%cZ*o)g_J!JP0PLs==;_gYEtepmRH;GvV%Q*P&SSqk_K
zJcU=g;(tr<FKTS^xKI!xM<H(eN-HULp`W_)J8w16GbiQ=zC*zO?5^~)pcKo3#zG9a
zaoh{=0-h@SvJ3s!;gv{WUi%FCU6wYa{sIYo*!ljv%kh7JV^Gw;*^~l!qU_i1^wZRB
zol|3Qh=^F~>51ge%C7Lg)=$i9tU<3FiY@8D-{G+Zzq;@jIZJ<1%S=pxzZC!Oj{nb+
z*q`B?hF$cyZBh$LeZGcIJKx`gXR`F)Yansn<-Oj)Q=#zV&iCId!ngo)8p)SP0j|L;
zJb?S~YS&r*OOiiJ8dFjVH9-Gs83CVFY_N;;wI>JT8kI7nkbF}bcuvJ1dp=_@sj<hQ
zhRLE|AGH*$xzGE(Fs9D(F>QkgS>6Q|Pn7q0Khfmk2{+xwN~Jz1(YNO_499f-eidzP
z2zm3O^F5zFjfEIjC6^hY%Oa!Aecn%m|7i_&OGJYoVNS(bdp-j&sck9)DoWav1^E6z
z_)8A|!*1#KiZ+6M+H0Fdcv7|bk6Ev#q)I4On(MNix}Kpd{E1lVlQt?X?_~X*@J}iq
zQWEj5h=e}xPd};J>POY?o|jbk2p{3|{`WVaQ90x46>n<v-Uk&bH1>JFPh*?MT(4|Z
zTJeJopxf(-v=MGf!!BYephaPEKl&en<FdZ2ifKaz*!+PT#d<UrVNC7)ITe1qhj*~B
zKm3XAZ&bb71sM>Zg~9{qXIj?BfS7+>_z$%w2g4sk*r*EkS5zmwc;Hellb@5Cb&yly
zKJf4S{`emh{Y<GSsGe2e3;QoM>(k|rcKAQyjH_DR5B2^jWP^bXc_AD7W6$o;UQS&g
zWpjVg<Da6N0@N1#n}^@~ZNb1E^^cWRZreWa<wzedxemik`PhegER_?x!r`Ogzt?Ra
zXbAa-f<M;b3#weI>gB`2UsNU|FGsX25<1jl?X8xLstAP#lYoambuX6>Lwp8*kFMZL
z;7_s%#oD$<eTXM=mzNj)Zd&2|VV)khPs73?>xX|o$&zqgjS`1~-?GU?m%Z6^jTQ$#
zC6|ndWy7r>e(!xM5-4T)ck?hL5bN+gnkQ$McUbs~V&S})w|u0v#sznUspH>6!5_Iv
zD@nFf_OSG8$jfKiAi6(e&F{6tvsM?T6A7rpiuSJe%|)smMzrmGInm2s0>9OVG>^Km
z+}=;6AP-Ag_E=jNUyQ$;d;iGaHxS5nP#*33);Rrz_wTD@zT|!1A#voxr>@2EXv7yD
zYaRT;`-|Mbisl!9kaOSvlKW3wjqR7bAG=bGBlEA}kADpP_;$%JIlqVh$RuPL?)n$Y
zpM&H_bO2gC3Gev}-A}LT-TKsbu&F|e^~0Q`pnAz31?Ym8wF+y8xrs!30R|MNv)0bF
z!z}$SX?W$JM&qxmHFEthWLVP2y@j`?pE%gbNc8=Fg;*NOv#%cJ{a!Z<%gP)#)sT5G
zC*(9ZYrt9Po4&jBFdz)N@nx3k+c=Do)~kBGewU*O%}{HHr=NbAN0DOxtE$33;E7Z+
z*PmU_6N<bKK7XHTC3;+{Ze1JW4}OZ`pI%Kksfu_HWu%*hH{d|n5(ym)La!?s4D?rZ
z-NXm;Uyq8lZRSBsQHGVn7`chNf3wiT_YXz?J!%*2Rf}0gon(iSKdEXBOK#<)6`UMw
zB*&_^HWH{QPrQ5>`O_!j>v10HhN?Xd#lJ&td|adu>0#*~Cp?QP7u?eMyWx+zK79;M
zYU%KxM(C|8%l+2(ckKF%!wF>wdo?|}qW+NZ{(fQFAqAKxwfVtZ%vJrw$Gy;yYL<BT
zGvSgnAK<N;srKZdNTZNXXn<^wTh9=Z)S(|_PcrP&?DA7?L7Sa_R$<=V*(j%_%_p>X
zE0q9Sfp_p~pN-Kdz&i~mwT>aAS*ohic`5k!d`gPZPHTW{A`<wd80(&4#)6}tNyXS}
z3Q)e&`S*;G#`^dS9Mf){Sb6FX!vB6T-aeR6n0Hh?4`m5}kLrouT?`grQj1xSspp}g
zB>Al9cfT0#xR8I`ZJx10v3Kx(*9^Ibe^ToYCRC(gg)(pUUz(nC0peqtJ|1hmOURJV
z?L+^u=<k$vR!1^omv#QR1L2=hhIvH%UsdM&z24s){|5i05_BQTu>teS^UocaqnpxC
z47n9`cKIRAwI~1f!X%uAvu+@FsM49&()N#giE&Rq6K-XJRVluPcl*oFVVHr_Fm3Z-
zRb=r&^z&&S`Prjs;xo#T=2d49as;no-qVkbV(P4oyardawXY-#tm?|#9>R-Vc`$-$
zxCA$~HLoBg+=LZZrSnuJDc#QmT-L_+Nmpf4mkgNI-t5;qP0=Epge!0v&M42(3y6ST
zs><o<t~4BL>GaodN&8dsy2DkKS>A*DzDT(1_ReeX@;Oy0p<=}#fS<wRUE$AZZSZwC
zuOyqb8-?!Xhj4!{{3kF2=i#FEu}mn>+*E(q1Go#1cUCzW{Ab{T)&W?aF_Hq9gJ&u<
z*aiQm;f9u;ACm`JQ-*jJ9>D{6v1^_>g%fZIE^Bj+HT?xDn1{!3M@zeR!T*Rh51mq;
zGVd0Cl;IsbfCsx4YL(y&T!*V#2Gr|vG|TXEfBeTVrKRgvv^>RH47;kFcB!%f#^4-W
zg^RksUYBEAgctA|+=bUW;V-~xZ5%)0c5ehHas^((6g*Im@-Fmq60X8^xCoO<uLd2w
z4zJ;{s{H15#s46jh3jww&S>>stl1dBfDBbe?{>l;=;w;ypKt@{xAcA>zQ?;(*~D5~
zbx~DP_I^bzR&P~k^ln!fJ_x7bqOdrv7=KkoE-&F8Jlu8Zdyt<u+|v1yt5jcy_i!KX
z!1G<{KY~d(4`0DmEzdCpvZ_k+hw#{16nDk{6}SkeCI9jgo(n27y$7$%6-)i?--bWJ
z6>YO>>puaTT52%_d3exuhDd8GZ)<6M(X}oCRX8onAjEavbe&;PYfHb?GMCiNy$HhK
zyb}!1{T&qG415dU!B^7VMVBW$EvYaFs2Vc^|FjlY2KWc0{j5pfvgg6CpWTW5OK$1;
zq+9$JT=)}kMvHp7;ZL+Q`#ZP^C*=Qye&s+w$Fx?U1dn#|z>HhF9L)D&*Xdc;4((oH
zWWMYDA-DqH!c|G{e&@(;DGG7|dZFF5G`#?);2Zc8T!$Hb$NbL?ZNd(9SKWOMvDOxU
zuchCEZagi}%5rt=8wCyi%UZe}%>RBjs40liPulYlca?hQ;0*i#-)ogrzZ-ZEoY0f5
z_VV#g8)8!TcgrnZx9mwpX7!+e>c;;Vd<EabEw}bL=Z0KtsCF+<)GKd8C2$;WYHjs|
z__w8*6ICs_32xoJu1dp5@Bg5s;lqlbwPgcNy29OPC;kVtw)Zw%b7jsU06hXza8g<=
zwl;O6pVROSd;=Gq`<+OJ9vAZjsd&E&|1q3`Yj8_zyNBH1wov9ft=jK!uGyXRa{;cy
zWw@lx=crqx2`5y08nnvmF8VnQx8culUGgV&lieF|M)#NO?9(v$c?)i8^<rKL*pT54
z)nCK1DtE#EMYs)LtIWPnbsoV9n%4WF@}tAQsZsA$_zu2>lM+!ibrI(jmF-hds0@$1
z5+8@ll75%8%P)xTf`-@isc;KLWw&Xt2q$FzUX<;IYhvxuuAY)Yl51WD&%oF4C&AzA
zTJ3YLZlp)ra08zB3mk*XZt3?h;6{_T)BqDuT2o`jwhS(afxdTprz2TVsBULiTcJYn
z@hf}(m~6*uuEO7@MKm;HeMo|-s#9#ke@Sagk1Kjwl=QdB8CG|-DYIf5|3QKN0nSS3
z=UuHwueL-51sW=5UfTQ9k{&nUw5khnlg=%$oKw+C2LBUq6@Gwk;Dib!^R8bza#Jv3
zskC_y|0&`BtlJ$KWrq{@ySj_2$NwZ;hd*fTagVOe2JQ5#?#Wbg#p8bjF2L7XdOw<-
z)f>4j%y~D%WxLy;q_x$5hAX1;*qKO8mk=zg=;OVI|10=T`0EpqR5Yb31lC7k7*bdC
z)=mGnv~<1XV%SYJAzM(FF8t?U2Cl(vp`c&ck6>x{!l)|eEvxj_!+*mu&yY)?)O8EJ
zDqOap-nJ7u0bjwN;HGw|=G>g%yq4Td%YX~`|LF058g9YYQq)!>9f{_K+z6W>m^1Wq
z4gLtX;glS=wsNR}k^5cIV?`{Sk)MJ7&SrO4TGw%2CYk+8Mt+`zTkr?Cq)W#2JLjc3
z6PJ~E+fe|0a$df2Yj+!NOm_IOJ~3liHQ%#ddf(9A=rN(#W(HVAOVQ0sTCk<ffdDVS
zWo>dTI00g!D<?PU9K40sHgIqV&cQ9XA<rptEvw-PI%EPaDjxgU+Kqzrz6M{zdCknL
zXD7DCf*<uvEZV@mAOM2%9&G>K?O7Y!UUbPebSR{_7~-*ga29UBHEkX)xk+EBoTDT)
zSm~N}nS*J#qTtbxlwPFzlUQB+fuz=T{ta`4$K-sThfz0~*2WIx)P*0xC0NqH#8r>~
zV0#9pYdHg@t>Kqb&{RD3lLbOba00GomyV}yxOO5!7?bGP)W^3B|EJ~oPG!6Oae$<P
zZbM7oR|Eb}x!_5k8}k;RC<r4H5?zZP|0ztuCE2beH*Pv|BM$@oZ8z?DyYO=a&cJ1@
zeYW6%Ne^SeDY&BE;gD8l5H7&C@*M`<uB=3jqp8Fou%dYUHvGW=xdvyT*Uj6@yWJZ_
zb$=yd;cfhn!ddtVF3J5%+^<+UOyb(_*Hl?Szk;_R^3gBfJyP&DQU`s|vuhd__txak
zarg?ZX%ltH&FYCX2`5p0z3!$PZ_A$$`MKaC$NuA~%6F$!6u4ycGvN65Bn-LAx0b6V
zieL=Rs&6dxo^Qi{9lloZJl!!X-EJy0zmzqeyzJq>rr=%TMqakv(4mkasAyXAGOQ=S
zjME*X>P~E^dDw=IG&OY%b?z?)Iv<t!@s(Sp+!pItAxG?HV1$s=$8dlWoR{Zlp<|OF
zCI(w>WYnr;$R=<CzK1_K-B*|A7P|@RZ8!RN)i<4uz*q1CT$A~ixFMDP<5rkh)xge8
z6n_fNn*~4YHRIvL4Ku1~LhlMN1-IY_xF`?VTGd;QhK(3oPKMV}@FU=FV#lnMrW@UD
z4#kF=Rw{u2&%(FxHOy#Mchh-L>5jqa73qLw;3RwpKfq~SUe1mCH5nWUiiYlQF$jq<
z*^cKG_Aq1D7tpocAdRX9Om3p=$yKegA5rMjZlUeuVL>|Bcz_AG4!7m|#o3+3x$X6N
z9g1kHo3@k998~J)tSZm)*<oE3x6L4@qb*z0SPo9YHMl1CDRl$(aMLbh{Ynd#G@!N<
zy!``QQG_FP0<h)wV4$L~Wlh*_1OXO;e_u&~+joqL8WL5#vOxm>ZlL(bEt#+V+3H=x
zeB^v;+|Y5R;0D}=%L<(vp4!d~O<Yy^vYO5sfq$<lcsJ}yx6J@UY-c`YxWA7<ffeB#
zd?WrJaPtBqN4BQMLwvzd$Z_HSk~UX26ufRK1Q$DFu&tEBHvA#@cU_SmV_Un(M9nL`
zS^2SH@-u{A&S;Z$OTJ6a;U9;>AOe@;9E`zf#m(|g;<vL#Tw9nNRy-+)iq6vYF;_Qe
zS~KT%wj}zOLolOw+O}PDMj=+Ke#C8VsA}A4P=ZsMMG=L{le3BsrcPpb(r?QkE5MlI
z(Lq&h|K6zkcq=>T*Q%*gRVp3xORGcXeZ=|921Vrl7x{uy5>;seBQ9<4rGLZiI&bLw
zc^H?Q+H3d^@DDmgVMl1WS=UVn_kB{fHpnhGrS8a*>o2OiFm6jmNJ;VJAxI6M#}#)k
zx_NDp!+c8-#W9r$b>JTqK6{+Fwd&O^H*u;8k-BN86prfba2<1TZ~N19MbsvVLN{k9
zQ%PfW7mq5uS(WLNxK^09h$oa8&A1w`4nIsO!j)u)wbk5?zJR<*IIBoS2L1uztmVv|
zhEqm~eEBKCpQJFT)?mvUd0pLdVTTH|phru_OGf__qUS_oGxMUt#A#AwO^>OSxkEn_
za9XLmybK8Qed=ywUO`LN1GDEsYwT$y2hHD%o$ru1T(-0%-fG-B-N`eGoF$I!jq_WQ
zA`R7PTR6Hy&&SjqYb{=hq<29-8j_Kta1zeK1Qc>G3dfc7Yig5C)9LV*lZd^llsjhl
zAA}ht?(=R|7fyjCGAu%o@wjTlw&l+$kzhf=f_|N)<m{nPq<a$1!4wqJf_`vFf6>AY
z#q7Et>&qH|aa;NvyT07KvpSp35e<hiPE0hLH~3}El9p(X%7Ti;1F^FsVri0a;iq97
z-WDhisc0;g+m@(F8y5&_>l8g|B=3QxL`&NTT-!`t<LmM+5*X>HdsJu}vXQMGjYk-D
z@hfk*HeEO)Q1ze^GQ}br1KBCJxDm4rXEkp@OR*d))Fy!;I40jAaYRGbYcRqZP@U&+
zD2M2EOUz4dXKGa-Y?q2j1{4KyK2Kvnh0EqU*-?E9mno1TB^3K$C8xsak^;>o9XL3l
zdAIxq{bQh?PD-Iq@#S9E_S14SXR!#=!P|`v0sax!HDSTSyu9I>v&Iq$>;N(*fG6a8
zSc7X#UyGee%qx~;0>55@AvKQVvZ!u5?VUQ->{p?BAy3)tLY%IP-HNbqBXwcHk^w!i
z?D;wG6s6e^iNn9y0|gc8r_kupc!SYw@TKi=-qcU$g!~a(dT#t{;clGEXf{MsmOxJU
z>Iw8Sr~-$QDwk?m-vRwgBaNbpOgsELD0(hA{%yN8A5}Nex1fC32o&;Ed*v4yleXQ!
z*_u;{ZOM;e!+)Yd`@@Q`;#$C(ZsNAbMP8heK(pWmW#@HyNp@U(?66x>wfra)J4=U4
zN}1VXia(Mg9!ncGTrYly&{DS+3b&qRCo!8y&WXdC=AM;Y>1oRmP|L+HVwJlGn#m`W
zEkDqdh{@@%HuU9o)*ZA#0NcUQSnPt__NLo-m1Oxjbsl9aFP$(X|F<Q@Uw0eFA~%V-
zsKL;=)a5%(V%8n$qL5<Mxsc{xbw*4N%l$H0Z|T>z`<GS}GN?&TIcVlxZ+OmG3aMM8
z9=S^~@mLBTOzNzJw$t)$rwnk8Xv?*!Y>)QOyDSGTgJzz;M@OnGcSnGZxC)olX?zQT
z#LfUnm6PuY7HE&!{>=T2^;g@@5@|Y?G=pqA!P=)v33Gf>=lI6@^;GxYV@v0I)H&Fb
zRp61BswL5zI%EZ|&7~SFP*lr^Wy}(Xt%i;e%Qy_a+O|Q_=eC>1?vDXd)h?7xt;h%`
zY7~Xcydzsv04*oLO{mx)kko8!A7nHV!tLEkM3;GGqdV}IB)4-ed)IU_Ba$Cw{k#Rk
zA5@FlRMF>F6BztP+AT-XIUCGwY+Kc~rpSTabi=$GZvRn9g&LWCGNvLu7IFW-A?Omf
zbiSF-h8<%`L0k^d!Y@q;@*o0pDUM5ml*x<Gn>ut|io+7R2HUo9z=^KJ{jtB3IOD*_
zNVdZ(O@|`GwK?MU8Ouqg5Sg0YlJ-cP%4<3$=zp<WJ+<9IOi|^WJj4mp$R8f}!?81#
zBKHN8q2|<^R!AZzu8hzck}{DKU50t{W!v^|(;Y{OWxkpzY^H46v3F!gup4SP745aB
zT|wkd7g>;^F3PmfcgqdrXglP2@w|<ama`<9vXpWzXiV0dGefZxSQ%97rrBO`meeem
zn|hTJXJ^ct?OS)*lf=<QGrI;XgYdkQp^*!uHL~)=$9OvQXf-zt9X@i4x{NP!E<d>a
zzEAMh^=~4_h_Ne4j9p+j_Pl9IqwQq1QEnjEiUM)1E)mcs<7{|}-buptmz!Nb(RMdy
z|E3``z!Uqnp0-=@E<;PQz94iovjC>fR_>tMENY|w$mQ_5j47`SwmHeNREaYtTXLdJ
zpQk%TkVU8>7sB($F#(Aa>9N5dxgRn^HOc;j<&)Z8x~FyZn6E2}(#nR>j0aNv*Q#Hd
zY-VoDeJ*jUYE8D}oHnexC1%^RjJBM~jT0cbEIroU_@zd}?HujRroV8efHxl-Dg&$I
zYVz97K(m^eg5#>hQF7Cz`8#Ey8?fnCy_hmEVrt8au4$}71!@g9THXS^MzN9e+>1KM
zhMNQ4sp%@~pG(d`iLxj#O<I-03~HqF4_i)*H?!%#noBF2TUH6-`MTSyXFO=p_rx)S
z`Ega1Ay!bdZQX6!vw%(NA|gh>ZPy#VW$;_jKFJPWX{b)c%&J6Upp)pe_;}4_4QpBh
zl{m|)t$%N0F{%cC*&*30)5xtuGJRT88FSe-Ry7=vn44#8oMw${{P;nmM57$<{F}*$
z<AS;*c@4$c%5wJ1X!2pRwsY~T`cw9M$G3DDZFfm#=;X3M)y1;wM*n417DrNaCa5f)
z9=Tty>n=JaVe1OMkmWWjpfsUmN2^;<x+-DN)>T^xU`-Mm$*<UTwv?e}?3H&K+eAQ9
zOl6sM(<C*+e@*AFx&#<mncGxJZQETz)##_H@0gEc@|Uc+k?PG(dM&NEeD-M8{BQ0s
zsN0hMp@y!Eu(2-dGVhEi-2aK)tu!>*VIy2X8CEoB!~EpPF>303oR${f&HJU>p`dcy
zbzYerZ~8HIx3_9@aT*#`v+0VQQm57fdXsE^$%c<M(eH(-nw)(5oQvnj*_oeZ!M_OA
ziiUgy8Zhg|__pO7?nrh(RWi48AK~N2vc4eFRaHm#O-(lV4C|(VSB3wgQ<Uaukj(_N
zG+bg8sx>z!s^YW+B7~|4Ggek<Lvv}iP*cZ}$|CE|0*ajtQFpdi+3oqRqmrg&s4A?B
zt0HQ;l#;2mO^x=cSv}J#98-l&O|fQkAIv1FYjp5JDB-EY8Z5z@STxPrgh|#IX{e0&
zv(f*SE9lER9cCy@oJg!{CqgK}sx+icLSUw-_e~y~C6Q<>!<u#^Z=x>o&ngMC+^<=%
zzI-845&kkPSsC$`6k<JFcu?0iLlZ^~&9(Uib7ARxMWeXaV8F%gyFBQ~nHLq+V0QS|
zv`Ixd09+G5xb}jGuO@?S#pr)i_fgjPn^^k54C=^<lZq<I7h$WKodoRBW*%5W<YZSE
z_-)JZzXZ!F!D~59)^cW?{hJ^^7HqUo#m)ICIWx!rX*=tyr84xzu;V*zsv>n>g&3)e
z<(M`zH)}%`xF4+ubxniRtK!>s77ga@)ZF0id22-4Qc?J-QbDHgO^jPYtD=oCZ$cTQ
zG2|kgbUuO!^VpF=Q)@%tTMZ`~^Y9){$%6OBX3&>8mknChc{e=#@8N{P(s_4jh!PsQ
zy#;N)`2^L5<^;{FDyql1?UA!AQ%60YVA(Hyeh+i1!pJ$@Zx)D&!HRmy-`G&5O~F5}
z;Bw+BX}urO772V(4P~<__~&5Ec>oCDnu<?~%E~OcCFvdd`6z|hFA_{0^&1Lt_y4YI
z>Gmv~RV>A_nHH4tA<-4x_hP{RCwL3<FzFm0lVKKmj%9s()I_Ci`gsp;U`E-|f-~h4
zY1LHj+GinqExccbPpYXLbE?bKpfOTS+HPKpH(FH<33{s<aZ3i&rHQN<GvNR8&ZX^P
z&dju8AttU{j*u+=u_k(5H~L@HkgZ__vMk7xlLu25Q7yWmPHj&Qqe?MuxqNxiMTZ*F
z9&cdI;1Bfk0gh)6TyUko1s7o3)M%kq!+#hRc2RkJZ+JJI<IxuVyoFa`s8?O|zo;7e
zhP>g_#oZf1Wa!^}<(JN{!8|O$1`O!47O+g5Vck>{$vb#w@Na0&)EOn%3%b8{)_^w!
z9!UNZY$eEW7;+OF+Y%ayquVV}{{#319vK7L`{SDbu^l3Io@+xR&7Q(tEuC+m?+myV
zOQuIH;%C<QC+)twYu(IkRdS7~huJCyV*yxK*x;iU_dT$QC{3kO2gIx90(QlaYp|fT
zt9M}5mNq9!tqrI=#HWIN1@>on3BPM1M-2(b9`(Oi4O~9!$hB4e`v`u9`_>8AQeJmV
zwFHT?H!b>Z((@_&2KQ|uN?m<&1FGZqxxbu?4cA1#cm3kdw%gcl(j#`k(8!fjEyH8@
z2_9G{bt9XXws5bR;uiGYa`^w<CZaS{TQZ>11j{K}vbrF%Ze5#qe}bpxg!CwfP;}Jh
zeaD~}%37O!*KbX&>HGbvjmhtLKNkL0wYGVi{%czLKI{fCwp=wsJ8QkX)<l=r76r7r
z2D|)*6WRtv&~^{Sze`>8-|xzJ&2F+{#+qCE{SY3)hd@6KHQJ6y2jMJWUw&WH+UPsF
zzh<DHqBy7D)y&}NFP1ksuf?B_{LbjS%R^h;d0rJ+MM=+~Enm2`#U1!#l~N2k57gK$
zcCw%h@8CDB?XDU947l;9O&7VvWeh_1?tQI&{%rhfg<zJ4#%1$U7rt20;?(<|{;jZW
zP?exo3zax_Zo?+L*XHS`P)Tt1a*r%9iv-4!du2r$p206#n%uApXgL}x><HSoG<@eB
zJc8de5^2@2+)!?Oz!gJz_o3#tCf*SNcLZSQ#?46p#EM+Ru7Gk~hQ=fK6@G))HjAVo
zOEE9e(dB{~p9ksrBm4-@{8+`hK<HIDh1uS97g%4{=GVLM#0a?R$h}uGEODlMm-hM*
ze%5}=&VO#|<5rB@$Tl-u<maWN_j5m1G}xO(x3N8Tsu%eVs}g`e!9x?^RRvm2jpegx
zD$D(?2!B8OSuicxOIBNE{$AC!G*&ea?N|5}UieV}miV%Y7&Ai(5<qp?T935(c+OsK
zt4Xm}Av~YBj&;m=t<Bz+UI^4kZ34YJ+K^jv)0bct?!zx#<6x@NRXxrF^*s>{1#jNz
z{l|v?ild)`BVnKTi{yJf)!O8@o`1vt%Uf-T%VD%#{4!{td+<BF^rIvi*=%~wH6(cL
zwWjm`3QvuHsuDFNH^SIVuS7Qerttrxx4v-YHC9X3=K`@g-SMgxSN;Y+?d0D`VEZVz
z7oZJ|bPLwQeNX=ub~0PTmnm3%%`!ZLp9KH5{AnxBU6dNjxmjkB=>N0e5B$5ilYjdZ
z1*$m(5a|lHU|yS#e-EAb9s9EEk|2rxdRxD}sn)yaa0h;c=Uqchw|3YYCi<(2xXr<1
zN&hEZ!>D}gW$HL?OCHsdJf~p3+}DoY?eEvKCcj0z%NhmvTE0uS^t1Pkjk<!Cb%l9e
zXlZu$Ft)mqsBLGgZo088!E>8+>VM9|zo|G$!4<z)7=A<N52E)+c+xf0cgvN$x17D!
z6zQyG_jW(p75@!K$reehs7fG6ug5Z<W<C5BM{WfdeYM){P1){2dp_=hzbtM}UGtSC
z>#GV=dH6r?ihpx9tN<}mNPJ1tWPgHRyM~dKoes~tu&32G1lD;1e}kXkMb}X1EqN-5
z>*cbf&!$FEzR>;O?;6NjQ_Q9zS#Mrj(?w3-xV`c3T#H2~iEG)&oJ=3%;);C7A0_=V
z^k3G;arG}aTG|x--+|xYNf-Usl!vrfb1D$3T6eYRv|so6Hy!?cv5BPtS7kc=DBF9p
z3;vpZqTs69+b-<(R@yu8@4SD%Ara7aMWcqihUn)N+=ZXuZWsM;hy-wUQ&UV;Rp0Y7
zC%xBqg+G=KHxXo2H}56<g8Q@3MgL|7bip4<`z^`<{t<pP1GtmfYr-1|-9=5L8Bnig
zh`YUm=kRFPrR6Pn1CiosXVsWjknUcQ`Tsymr@PDOKGWGxzoMB4;R1tAw4%7zYj_H8
zyGH6);T0tM{&5$~s|fpVv@!kN&XM{pP1mc*S(tLe@at}z@4cPUGd!#CRA$1Q`kYcn
zC1rRE_cV^U;a_grU=aDKqi*G7S#jGZdo9fl_t$_`_@Hsp{pv0YwnqrQJ=8ejUGV=%
zqsN!DH#(=3mxUgmz~f!<|61x|RkN^rb^a|WuqPUaytEU4bFzKc;I!t>S>7Yy|0O(v
z$GhTx8J@`4UUnnp3gXZOjq|+^FS<69)Zwip*0LMiXimVUW=Y?Hr@M}%pBK}w!zGx~
zNV%4n>WxP7Ki_qn@+Y`2MRO**lB2GBeF2Z)5xnghN8Et7*%0r98)4LTOT+KM)4lMI
zJ~gMlt{!!?t-yPDta0M=JKqlisU}<TltwzX#8iv$79MGwdb11uSp^=#<qgXr?SmG4
zhG%dW?!(JnR~pPYqhU=m9c*&Zf{^!MuW{Hb@W|c&B#gM}lfid<0rz&L|2n*pt+fgl
z#5|s(?f#zcDnH*z)NH_KH+VKt;lWFHe;;<G|Hs;Xa#lNo`!vpJNrfjbG?BeS|CumH
zqHPq@>J98u&Ujwi6z2C?Xpq2!wy{j9*}SQx?Q@zz^?9fCv1brwG?t(!gAVy`A2b4}
zx$Da%n9|tRF~uXd<b1zBaA|S`$KZq}6%4qIQY$bIv+!=$VdVDoxP=shE(27TVtBXD
zLW#tU2^g3E*OvM94rXC~|D{}GFr$SOBgzSvm3Vx6;6jN$n9}g<po8kFu?Rfy4i@%V
zXfx>++6*WswxKNZJNR(mLXsJnc8#@Um)6hiPd@{2+)ZBYSMkTXjPQ^?tnB)7L2dfS
zHIp)t1-Bv(bawxRB;#5Lb4&|un&Qfjy8Oj`;P20(U{K>zSKtG@5&j#yzMO+&TG~D#
z?SX<f@3c^8PyC~n-Wk`}o|6GF2d@rXC^VokC&$G?iTHm(B=B)B7Ay+>DdiJ-MS?+e
z1=YD9{g1naRzr%2mQ^Ki@cH{;Qj>1RWy4yb$=m($A4`6m%I=fcl>7+vzrN4+C*YL!
zp$sV;UzGv)`oM)k{Z53BY4W~Rx_wgp+P3u*?fi{iB{cfo@a1J#Iposul1u0moCUCY
z$dzWP-nT4%Q?=J?hgs>?r;qn47*<xT@Y;bd8~g*VzupSB4+Vdum3+O~VD^^El@ECO
zT{hmLFzea}crgBRy1XILV5nMNInqkJVZ~#E1z$e=-tIo7qI#tOw{-dIhk?H+`1{>P
zwKd_te&G9w=-1-eEw>WzU=rwcD?M}at~az2v~l43MV;SD0cwf}9t!?Q)dT}>@ArnQ
z7CzwV5&u{|sH(x6t9zLq{wZYfpNjssOBeTlV$uJwoam~Mzq<cAHN$_u?ysoH!lCkQ
zWvAn`EB^c4()*mgzkJC1i``1jUVY!>*TLuSaVsGc7Y<%K<Vs7EpM45P8vKWnKZgH)
zP38#qx4!F2*Jz(#%E<z)ABJvrKB=p|I~eb~o+Qq;u^sKn$tnMmkE2K?t_SaEkCiSq
z4<q+>{l2B2zVLl(5B);?=f3#-k!xtEABOICh2Osag;!`Pts|o_a+!oLr2oW)@xSE$
zVmIRCOGqGf)2;S~Ki>0lET7T%vhOD@p<4T5_^m4T$d8fW7t&AU;Qx~ETj%H(-v2-L
z{KmigiNJ{SO0ty?V;+Vl=gL)3o^MOdC<i}1GB16$Z0nF4k8z_k@-9_aIpoH3G?mXZ
zWJS91Q0I?SuF@w9cT?{lR?PJ%U7weQUsgBe5#Toz)|6^I*t|j(POq}9O{Gwefd8T^
zxQ|pPa5Vbqa~`Jk1t04DSaV#8S^RAto=t4BZm9Y-_w2Cnr`aKkmg+n#3G`}uQ0gqi
zjibU}a+4b>N1~scD_M<QYxCA&;E%F%qs+-Z0{k|x(&*PZD-ZP~I)6b?Mk9fPC2j0V
zu8XQJ-L$sCL&D!9`fp|L{|M)|(Ps_e?@;h3uGY?)bB{oLNO+eZPs%@+f4iU=^ZSXB
zRC~K8mFH_KL=eJcA9iLq{4)R(8U{J2+Yc_zC-p<@r>%S4#`pp4=L*!b1Rs5W+OAI|
z_@~@{t5wA)7hrM!rOo{g{*dFSs55NA?-<zYDJe}drKQhJAur@OKJ5>GqRFF^E{)k%
zB>(_`07*naRK{0Ro;%Z@vlsp+6(jAFd9mf*U)Y}wO8WkU8Xgk2VIsi4pBhPERKrN8
zpx^D*Sb-(@v_JfVFs+UAKES`IB_E#-q@OVbc!#yRX-iADJKd@Kyr0wiV{V*YRo&3@
zzQ1uV{7;C4EyuF1=xj)E?8nBvnmjt;I(}+el9K7K+x1CdOkw_G8ve2-%3rV@6?;8H
za7_5`$=?6mfg9J+B;GL?5evtzPUC~`uiJ}$#zg!U<!UHRWc_h_KEop3ey4*gS`YPp
zp9G`9@9C$Mt?~V6I}-N6|1l*<t?j{TROhv)W&qScE5ASZr*(5Bji=a9C*FcJckcC+
zlp~nzjP8#3x1!z;@Cu&ol_d|SKk4>%Pbf*ZB^rLIrR%#?IYe68d<o8K9K4mMZm7!U
z5xm)phDYE$T!#y)vWeVSlFzDAdbJl0bJ{z81<pzawk1G9lJ5n)+Uxws;1ryP^IC6O
zl=S$hrT5QZb}#sc;EeXxpVmZkOY**kC-4kj@1;Xa*}dswO48L`iB71}`nc=+<Jwzx
z2~Mi>-pbTwRig2FFC8=tr?vWOMjsEoo1w(xu}UO%kF@u|3E?jkx|GyMx}rS&OLz(M
zyOQxFT+rSotFo$V>3n$k8NA&K{vkLk`a7m1bEG8iTX+Ib;q^Z7pVr>4Y3;5+9^^8-
zfyeNC*D9Gv+w3mt@S{*v-B>7bdIHbk{a*Al4CmpBzCY+1L|5PyJXVR^?s>wJ`1cB&
zb2h-btgpBFiCy52VH(cEB~?lFxfuH=x3T@*UN*!CoQCr-BLiw!9z^IaelGki?F0Ws
z(O<ulfmO+$Cz3ziVmyLrxD1!In7dyn3`y2k@IWP2d&-|PS{OX7rd40&`5K<Ui@oq4
z<mXkms@eU0$}xlr3xj_b`VaWO2w%Y&7;)2P0{zUw3wQ~0d)Z(Ea2Bq?S8%d(_>^zt
z!i75B8R&ugJ7r)GoP}HPEnIf{xg)pZEf8iuB=FXsA%f#t`*|IX>n~aTc}2r81|Z#8
zria_x^&Om*^s1|bHiiv-Jl(ZYuL!5%YwZo6b`#*MQlh7|ry<(8G<!x{7cas&?azti
zd2gsrdKB{Ta;FR$hD-1be679hgZh4ufyXrCJr7TJ>VO_N@8F+ujgP2L;W!+FUU=vi
z#)fAcuE1@$31`Ip);6&PA7NUV%U$3fhl_Aii|fWzrkg6yd{$ghf@eE*a7jzUZ|JXv
z&iO6Hf7W4MByfM{((B`J9ljO(LvHH+hRWPdX{`OTo%AyZ=Y{{Xs&TZQgDNa&sZU{7
z_=|8BzK7dd92lg3OO59hIOA5jz3-y`6WaQBC0ixZ)W-QqxfKO?wG;nCa0$MLZ?*Kh
z=xWY4;VhhlY4{Bu?z8~{|9<WGw@(AatJ*PnR@G-wSEXSLr{IQOHtlEQJ=Wp4Vgd=g
z-sy*pY3=n@xxa&MeLygRPpfwH!LIawLHBoFWtOqav~9w9?S;Anue)r}8Lh3p24iaa
z8&IYCh7{8<6yW(z{Exs@EnPpWpRiV)kl{Zm!TfNi4(k>Cx8b7hFQ2u+&baEwxvux8
z;imTfoz%>nNX#ERm>FrM*E@B<kQRr2r`@f+5R)>d?x873^;l8mZ96E1lkhG43BH2q
z?AE;y$scojRq7f#(|M-gCj3Rz+wXE)HZ5UTPHk`tUU(WBfQ#@w{8<;6$o7vl>}gzO
z%K@Jm8IWjk;h*8UTin>zAOl}|U`>?UdVE6K=elYPM`XU&l!lmcW@!_idpoF4Tid?X
zCYyf653F?z#TH}NJ++PhQ*aypq{Vf;P5>wBU>$TT)tcUp8-O$L4SX-@-J4zZ*bwrq
zaWez|5M0#K_DkA+WX*=5iL+}E!XqEcN#F$Bg1=~SVV}6aqyZ>Zm~b0eYo2}r{{JZL
zKjM~F)l@VwqPPT5h3DP~?157fz(&7mb}aI^^Q)t-UbR>T%n$IjHvc9f;f9*dCbefM
z{HJXhIte#Ke5al7(G>la)JJT!vl;mN;GCrQRaq&CNFz~7Wkdp?DYqs=Ki|O*vK~tM
z_ZE*HP*BV&Cq8-juft!oG``O*$_m1x-_^YY;bDC(9r`;Dx1IJXxpu+Oz&WglNC5t^
zr=J|0f$v2>rhS{P`)*uYy(})XEknoPB76(qYGM?c9}%c<$ki<Zb$I6GPam9xZ{QE`
zm4>F7^sl<L3MCQcLmvwY`2XI~f5FZ73a1!S^Fv!@8{0Pc2wa5Q@Qo&Z^*IG-Ea2<L
zUU>x==;t>4QHy)~J4FuZcCAG*8&X>IIo)x%thJfPUHY~m8B&lnW%0PrRyEiU7i7Di
zmWpU7OP#uoucDf6mK0d;NRQ+2HGBtGw3)i;s0Tql3TIu(+#62-m*G1{z=_MxN1Dem
z;b!7*Xkv3G@^THXX%XCzx_pqhFDR{)sBdP~2rxMRSK%ADsKs%cZsuobLJVCDRyDT<
zFYOLT;iB+ABir2<H$-ZFFS?+{TEKr!=I0H#ApFHLP_1z@csUg}U2x8*cgC%4oe-79
zqV<-Bh>ZiTpYEyA&lp^STaJEgK1p6;Bri=IsquBqcDBwaa&bx`)=G2|*YT9oAdnR`
zuXH{#CHL)u%F~N(;6>sZPTKM;H{glEKMLov?<a0ff>DvxzSdOrw-L0@87F;)1+X7s
z(C5|yv_!x2rhR7MnrzRJEU0bi;6CB9p=QPnBarbOYol9kPC&F{2wbRI3~Av`<m+p=
zskO&BH><lY<b{8%s9tu{jQ9km1^-3)s5v*aKi1@f)H&p9UI&^1b`z%Df{>;gG@i<)
zZffuPN}&HDoPe+3ilk?%ciNiYnuh_Hb`Ir=De%x9bOA2H3GKp69rLv`b7DY2y=7R2
z7oLAF!ELw!$Ffu6ZFWLIIpr<6#htQ_5WBbuH{gWqBRjMGZ$pI8(765uqn|0spG)#x
z^yP&AqiZ{lC6z+!(5D~(3vdp8fZN*pjdO3?&I>bbZg;Q-Ou=pVGhC8`VWS!%y<Bi3
zkD6{{bUR@0I{X>FhDik}aAjVRD|SmHL&F`^f&dKge-Fpq>Dz+eMo{EsSsMHsfqR1V
zJLeY98SAvfgNa+%v95%AE0{l*;SX>VCKdT=yD29%4f83AjjAfUu7q1T3E#jsa8hQL
zNgv$Aom4WgqKUEVz%lq5{scE*+AYm$3yX1fen&%P*Nfo;L*l+0&h|~*te+$cLg-yx
zH2O*5EPMyI;e<1g&3!Q_QkmZs<s>)4{fFSe1x08q^3lq!AWC(+>q={O@PARkuha5$
z(j8klnr;GH4L*7JPs1(vTJM_xHsrS*{=)@-)__co!7cgDrhQv-92>6xKc}DG@c18u
zt8zchb~m~23%N}hR@{VRQaA_S!*_7f`HoR`Y)4yCB^bf0(!q=?cybBG-OlIOnXi$X
zIoXyC*hxqR`U%$CgnYl$dD@Yp2{uD0R0b>t1yq3Z@F&6FuK{y8$3k^C&neO5i;sc+
z$Kkqz-)!$#C%2g)v4C3LNk6yXqO`w7ZW?aiM$xvEshf8x1Y!*Tx8b;3`i{eVBF-@u
z2#J~1p#8pqKfzTPaTZ(Nox&=G3yP~QdiXDi{%7R!C~y;RE1NCNWt?!WumlTWTH_u1
zo%+wYI(I8IF2V$yQ9Ln8U_^b5{n;rohCrMi%xh`*F~2nYgzJuNxvyEB?-o?tT+kB=
z@&ok3loEeMd6-o<D94PLq84@@S3Ee1U{u|$qpr8TtUsIT?`nt!N8zM`tN_DsQd#pp
z)nFCf$gM>5oL4LKIlnZ#M-7_8l5bva$8Pxy3aU&fBpWAi3@*bJ-Cy2uUqiEw%o;7i
zuxeC7IE5kA><ncWhqg6(rsU$$C7pjp<E=ZDTPI;$h96Fcn9^zre*pzEcK%^?XOFqK
zd)xhKIU{yJ#5!&CW8JB~yRz+iv>R@AULHnOBhw$=58ct{G$AD~C%x>}bXq-gQz!_n
z$Z@Qt;Zsr_k%UgmU1TEq>vjCMP5;x{Bi<{TPlRxPmlg{>C15-F50yV7GV^oUi8Ha&
z=Xtq1!B$Tr7*T8cKz46fP2+kRj-&eFw5t;6+~17bSZ>x^lr<w_mDi5K37Aoh9wl{0
zkIF|#92QKNN77z{;@?rj|5=T9=*dp2O;v`xrP1F3*{AfEJNQ4Y{9vymo>q3-Pf<ao
z2{`TXALu9WbCm7MXggYn6w8~0Gs=ggF-*WYIPZ#=dK@B~ip2(3rXQx@j7+@_{ZESc
zi;ml>P$A8DVFgv0j%t`p46PpRTpe(;iv1HgKOp2JN3>+Tm}j%kH5JBgtxhXDX}c|(
zuwS#CdtlAt%YBOTG@Z$9Q55^fCiEzt)@uTEL~-}L12wjtWNsgO&;<S3n_VoiHKwwK
z9w%rUZa+^=7C><Adfn3b<wQ%+#}yyX$%V}c1dq-j04Ig-1e*OS{Tp%t6C5Yx+^U)&
z1B~HI1vZDW3v*M+P;~yL{$=>Ze%0xfU^Nx|V=@g5|B>!8mhBv?%6h<s$=ZD|rnrC3
zaj!9b)2$m%q{Aly{tHxwRQEdQ+z)SoH(eM$=Qeit!b%9t91}h*&d_oLZ_SP}5*pGV
zl#T$MkPR~+q0w@pKXLO{+Zqo%6nu>$Tf^?nrZcu%&WSax7Rw9>hh2$8Ps2(Y<s8Ap
zZt`SH|5#M1<WPuI^eR7>cL9&st<1prl)*v^rH+CAH%7#aiKEkov!Q&iQ>s$Skdj+X
zV8ECu>oqi;%kI@c(eZB#t&$rJV5i0XpV*BcHTf0BNfZl|M_hg`??eqQhfx(r<^^xS
z|7t7&I^|LiM)t-wJ{nMvybOujW}ixb`*p~iZr+T6#L-eX?}+H9F|6c9Z+5~mLZSJQ
z!My0x6qph$$4U+jh)yHtJGV(Qlx#9M@NlQfx!_W6^&K+A2Y-BAA{aEO85^-<S~J`m
zP7fkeq@WVrf*DAr>CESio!eOyNh80y=vJ*a3vL6qncNn@Z@9~_U~b+9JLlNy7dCNo
zTib%j@YfVNEFoINjfgfI*?Y@P$HPtiV<b~BxO2@ORZaJZfSb})sq5D^bmrY~{La$(
zk~8*9X-C4VM`tSOH4u)RQ-a(g7k2ZhL!@0{PV@N-w)DKH`z*Qv=ZOmecr_9W{-L0-
zb5wejV9dKW>sdR>nk8c`MG1eO0t~oi5!U?X5z%(Wh*{&kqW^99OV02#nlaH?b6Z@B
zR^(S;tIv%HuRFPG!Gor1Hkyu;J1m#Fwy#2VT4u{>vZlLKW1#?lqof33>H<!&6I%9|
zg=P*(n%7i>R1_Aw@7LTFG#y1+r|0&RZOg7(w6^YqVdMxY&QA5NT8c5vuDrC9H{9Xd
z>QoCdJfBi7xSh{dveJ|+i=129${s$4tifT`JhrtTFmigw$C4}BYG<Ep8b9^8)Kl!_
zTHC2X6DI-c4*wPfc`C7iGeg==w6wJuHi*2AJnD14OY97&CTRzfLbmNZ^3uXoPTj5N
zFjeezdMq5&Of4FgBDZrk=d9Mcvwci9rq0mY{&-zyNV2-O;4ZfA&eDNBb}Fyo7A{0C
zXdLe#<BqdrL6*qb2zdpnO?{hUNo8eq@E5u3j<Z5Da-N^jvsXijS!WjX#7Rt}=cc2C
zIE%>0I1Bg9mXozD_ZQj@yQ#tN@85@-Vy6;3T;9aW{ME#9uW51<U5SG$u>eKn05j@s
zsqT{O5>hjW%pmO?vd5L3ARr@$j<zFBQ*Yt2>XBntv)WT9Zmetr3BYZg8@X<~ntQXU
zQ#2i8<IIQ-rhSy($iC@_r0q=Uwo`@;%k8C-CV9uLv6H)A`^DK`kBnwb#d+&5a?I0m
zNR6DF+5T@vMMe~!JA7s|!zn&pV-Q*h&1+RpT{&~>4MQ#WorcRBm|M_tI=Ex!#?l5w
z`F@qG@0&PAs5v#Bq5nvcx1u{s<d`qXilA-$TUxH7%{$(GM80q8qU%0AvaQ)I;OCi=
zpa7EWDtl|Ks@uk>nwu9N4rhsY<ISeb@0!!hO|K>D`iUOL?hO%b;>gpIE8EtY(S*6}
z#zv77sg_FJwu~C;Rjy~p3UzsLhWECkkXBQ%{)Sr^6FJtaIh0y>w4(lmZPnFqob5%L
zNjOBnO<j1!@LzF362pbvxK9OpKC7Xn=RWD--!YB}|GuWzDm$0XZ^^VtvKD;ZF@3`=
z!ZCW;9u{Ah=<%1@%3{ITutP|86DHPMLXTBC7j-FhrDs-QJ^Y2H#$<#eSmez|UvVU0
zWNP@%WJzi+1lJJSak{A>ymnGmb=8QkX}qS-AU8CA!APs+&R<86Z0Y<hC+^G@H+I0S
z4_LJ=2X>fL;F;YLJItFJt-0J#%jl=>#HcBNoD*-6`wRAdN1kj7nAi#Dq7&L(nxw8z
zHgEx>svA~eHk@6a#n$U?GDh8pm%V=S1ZP)hkXJ1_+;Sq#yAL&&z4IC`?`+!+M8-d6
zD{d<X$FPypaQ2yP{;g?sTtn+uEER1bq}XYN4*rdwB3HWWP>^=ihw(#Q476lrWLDW#
zQTeNx9Cm>ww~Ypa*sRxd6%8RIaz=<BHr5h7*OXk{7N9lR?mbR=+l}F5VCuAS@I%;A
zSzuZHXI?p*;bqiYaeJn#HMMzHB;va)kjM>-YAL4_Qt)LQH?F%yWu`ZAl$g4Bww3YK
z(Mp>Y*l>2KX?IJ4CZhhj<GET@lLR+pfwF@c>Q1CtQ_D&y!KiCeNPxdbR35t@YP!`+
z4N>T(=rlt={VwKJcmKcb{DZpAzY=!#RwMu$I%>n2S&?%BQ`NAB60$O?sSPDDBLdq;
zl8q3oCcCO5ckmzJ4=BvJh;=Kg%yJ6QG&I~}8LAbBOz(f2xUk~2oZQEzM#FRzSXpJk
zTM8t#y^oeniL`a(C7W(jmIA74yM`~`vB0n)zb+Qo2=ue61=LN4aFaD=NyPfNKXRv_
zdQA$zH*fhPw4GcjYft>Tt;{TIo@F9I;4{d18IUc#zpSwcRkKM=t^1lE<5|-X4Sjz#
zxR_Oy#C>)<E-ZNCt%tTGan2Q*G;ljMHlg3Sxt6;(yDfFi0fB|;bxG2ybM4llCLP*E
z#Vx_V9N=$Am~J`mK9Yp26K^MR>Jqgp8tStaes)vi+!y6`n*~}>bfTajWnOi#^Pvo?
z;+D2o^`;3ei?v%`yP+k=AFUB;L&E~9a<8qH0@0&+4GkwSJMWhr12?j{qr|yzv6!K%
z8pI{2R5j`0vy1uEWKic^Y}R7ai!f_LLE2j8F)yB|=rUW53?mU%Rrvbo@xK7?RTkNF
zq}z1LsI8#Fn&t{ELA5UUSG-f_%)how%xd=jKLz}?RFVEkG~}7qhYEA9hIXS1|Dh&w
zHJhxj={j1@rfsT*b}le*O_NI2oEosaP0QK$#vyB};p;R4uDC@OrXMSAV-muMeSZNK
zx|VLw3I#QJM;1<~xMWaZzy)|0NWLy<wCY^ry0gjqT)HjD(6VX>JI$!;8q%{YrD=J$
zhO>`bV&t-FAh+q~qXx-u>7A`C+7K;N-CV9EsDy&jKulXA9<$(!&g4ZM1fOBf=*ReX
z)g^~;Zk-wQb$NEiFKb3WpY+f5thjEA5b`1_d;eWv*{TX!SKNxeEmy>E!HT+6|0lor
zqz%hj1hA?iI|~QZoM&bSsc5pvrpN!BblRp{bK1<_TwMnEC!6b3)sUfONB?!_Q{<Fu
zFzNph-uR(PYZ|h)nGIpKp%Hl3JAzxVpv?)Zc7PA?9A2rkHFAM~$m!Hbn{%JTv#@!j
z4GZubp2G*(5iPM%%VoJjcxw~p;04TCC*7)+Ieb=Bq?PTWF()Kwk2!ed-?vWbCB<^;
zj^n)!XsZJKExfXwy-V;4UP=J?Kyyy{=7PdbtMDG)z(+5D-oi@_s@iZxqp>64kWE^L
zS?w*`W`GayLaWSL&R%P}=tM<^^E<6R`xI2&3e1UzH{HHL^OPE{5;7qBCCnTAtME<}
zW~xr__%M1?KM|~jS-<&WQ#2WLqFGIr3bYtoLx1@tybj&;ZHNDl;@+CPvDDps!|hX=
zlfkhK{~P!$iDUMfnQs<X+JJd@1G8R$hEYB5WR&_Oxbba6(lZQod1amCOWGu~sK8)N
z3J8}kP9^;p;5Ez}1Fvh&)O*-)s;?@&-E`8Y?10@?VDI6nBVmJfOI&C<n81I};Dv3j
zS%f$6R)Y##S<Wdq(=FJ#v+&waWn7a0T$A0^aAs}Yv2$I_`pz4m0sbIADp}LVBB!Z5
z!>TqHzYP4_){vkV;=gsb!PA~#!vt$JF#ktS|FfD9W?{#6Rti}Zt}Oh$u}<u=_Su+N
z?&@8!sD){-AU`|wvnYD{<m!&?<$~J+Ue;ls!3U$CRUK(g1|-`dpoIUun<x{;H$2qR
z^(Dhk!{zpSoym*zcwIw*-@`pE-F_c#sOn74l4O0uEfdd)DpugNmcBo?v5bM;d(^D$
z>jtB&HM|8M;2!+VFKrG~QgF37M!J?(ZOOHI50BtyxM$;Ns#5&@s${gFxsS*M={2jR
z(=Uu#>N;ny>tajXy|?uffqEXouYPf2+Zl_)D$mKeh%pkQOE3pd;11liwaIl?)slCP
zPunGJ+R|RH;4a*SSC-zWIs6T%jkM)#|EBXDX5oR>9&eALXsIxz-<5+{2D>VBHsBLH
zg<m}VRK#K>hyO$|7fbd9>)|cjfjjWh;I{(#L08U1Hv3?{)12=Akr@Ce?9l60%@jn4
zJ~vT?Pw+&0zvqm8647sOHgyxrgErsyJ^T(o`%xx!89KcxPK=$^A30tO;}3q*+T~@V
zpN0&rUX|CG<7pOTOWNZB{OZ@<RuyC$aMP-A9(Y^hgg(P7_#J+O7skJJ6$cNh7}3h-
z6Q@Q^`##j#=S73RF8UwTuoPbhVb@uf_4$kFzk~m_Bm30J-PGxT2CTws_#N)S8&5xx
zic<$%31CaXI};=uqQ4)z#;KXDSWsO~$#pcQiX8-B^D+Fb3;sy_XT|WTn@CP}ZT7G5
z8@x97>*C)5;V-vcERgWOlM%Ne>JQZW&ZSE`Eb8m)QvG&Cf7amxJa$XBtH%8mK~q#7
ztnShxu@(=$hF{=s@UUy3o>faE&faSaw>5Qe&%r(T2ly4<cokoEK6h6sdQ%43`i`aH
zH6y@^3)CjgaE`^g6-|S@1ApB)OwXKW%iWpJYSu_ud*4m_Y{T#E#ey@xEag+x*1pFQ
zfUk@JTPh*2P!TTQZkchT-k*1_?X8Lp3U1wYBob(7{K6Z!2S34Y+EKg>zZuRxZ)B3N
zrS986Kfl0BPd{Zvd3#(nyl0>mtPB4?!UHe8$p$Y{7x}WtO<j({1Ndd<()+3eeIaWG
zSbo&ZuP5+x7yP)5@oCm&H4--4^Dg`dzjqCD+saxTmdS{+=l>x3ztc5L&-gE=5Qb$9
ztt8u9Kfl6Dqo20;FR#PLE`-rgk}Vj%cVs<w^KZ+=IlTcD*lA0KyIGIj@~19CDd!})
zh0W?t{`>;JdIPehqL!Lk6|I@S?F`s8(a$gNxQl)&l0Uhu_H8*=IvCo&!heF_y6C4O
zB57tb#a2yHbMXHNKYIhRt&l<D)&QiE!fmO}1$Y6!!7n?9Nmhi+yc^(GhXx7%O9lO2
zz+L#8ZhwpHv+1I&Rb^(@lwS&whli4$&%36&*Yti%Lf6t<H3gZU!LRT)_<iRn%Z662
zY-;xArpq^l2=i;%{=a(lXU~SR(H8TteBY90Vc(bZdeJqNyyhaNHT`2ne7`8_|FuhZ
z+tU*KW-m6Rw3e0fdI`V6U*V27KRo;ueZT7DZdtb1TS@=BJMmvp9&z1e^IAgThO^y%
zf=4^Y0XHqF?25M<x{FnLo_FCFjbhx|3IC>Byi?Pcx0Iy}+V5}hBRuVbzb+l#b^%K(
z#|SZoC-4jW6@E9{+v9&zU#^IRw-ia;P?+Nm{4DsZUGQ(Za5-)X$A<Pv%)(>%6@Gzd
zyVAcUfLhthTZDJ;OtyQly)*E)-O3;fU|ZO&Ebb55;}>|fb0YJmf<IPgWciSF*^c4+
zzrwSwiP$xF`IbYeN^XCEH}D94mG9I|KMh5{t(>XhZebM`#6N!({wq7_$0(=aqIh-P
z|7-E@&+xcw8gNbg)0WE<iKv&A@eBC-3GVM4#o2QAR~GZv1izQQ|FDnzDGQ4YSDLpa
z`TGX$!;kROuJmu;tUK*f6Y>K6+?DqIxEK6Yp}pKGu88=*=ng{Y{M9~tmt!qGJ`E?-
z+n0zRLg@CnkDhitJ(}lzMq>>M@&=X_`hLFa%KPw?)TTY9!I6EU?SR8qnulHA_5Da~
z)@R|gn$z1V6PZ`&@ZDbL?}w9G1~8&<0Q&fQh3Y<L!;9Nb4yNIp8;9H@^Jm?KUg!3j
z=N)OL!WqFIikd1)biPokU}G=*O~7fm0OvH<#j@^i^!_}1JOwxo=QVkFRF_xP{XJ9m
zpt{#jjKE3Zf5fd_HW~I9-tGl|zvi)@Rikr3d6Ri~rD5#zUDDIzKg<L<r%54JwzHws
z$%}p9k6=ps45kHtTVmv+LdEa)+G)}U$MyXgx6NnWEggRi^Lx=x3db~3|EvaI7N8*o
z7U*YouX*;7RuWv${e_~sAVc55bJ73q@b|+hI0vU;MiZIC;Xi2}{p-Dk`2$6mhBF#R
zKA_e*bKf2-RNvU^Cnj|M)0(}S)8wD`2g;ukoPx_5$KEH(+i?3(UhYFbMVQi<ifK(I
z3D&fweggh?mp?}TXC%djH5)R>->2{t-t4s#sZY#vN|Rm+ZuDl5-*5I>W&#voMq@Fi
zI=%i8|4OF(vXb=&JSBB%_h%>gEvtB;vCqp1jB0Fj$&GIeM%2>&>vY01pkU&l2-xa2
zmk+u0dlbf9v%kH(1gm>-UwBfDc^Gsn4cC?SU)qax{4=Dny+f+1-I5IWbV&RUYnw)o
z;%7z%OWt$d^%>MgsbN>MW0!X@{)b>(@284itZRDU!SLsVLdyqO<wU5~Y#jLVgc>~y
za;_HP<ALkk?EGW09zsP*O%rGr_Fu;oo<0~63G^xPw<;1ifPNAf)})m_NZiKtl|wFl
z@6%+D0Zqne$PQdM@cZHYac$Hp>HO=O-nViX_($CRSF~|{MU_GaJVP+5M(;3L!E(PJ
z4?{maZmHM+<du8=qycpM(NF5ej06Q-adpqjhs6IN47<@>Rp$q;9Qb}wBEaBpDh~1S
zK>9cQhe<m*S7g0idb|HK=tNi#5koTK8VA1WpU4dtu!u(c@b4G01+~=!<yO}dyQVB_
z6FS0^y57i|yyGJrzTjeTRn<})`THSvWNj};cp|+Y$q}v`>HVS$gVtP-@W}65Af|ex
z_aoPw-*z<thkd!ENUG&<jz+&WIR=G8js}11@Zb1??-yM>`N+t_<v@HI>&Wk0@5YzY
zkA*Xi!2GCwC1eu61pkQx!7`DDeImDwrS)a-$1eQwCGsnB^2hl12qMG|-*jE^M|g4_
z>pj}%FBAmbk);z7kU0sMt)mLa#Oa8Z;$=tsz9Fct!0V9_WCFT!R20Nwu=vYptdOOS
zFJZ7Q_>U%_x(K}X1p>|{57ZQXJu>{M@^nXLuv}Ka9QpoAuF$9MvapAJ3a(V5@dYwI
zapm2nAP-Bzk#nz&_>M;ZHar4Fs*fxJQdgf<Km5k`@ObIi_yYP%9RD@HKz>9{M}MJ$
zFSvq{FE<d|U-o{K9q&`yCoJCm-C@l>>X&)E;j+C4qp@Eu>ag~$tt-B>au^rOTs$jT
zYADsacwmSlm5Vtl|EjJK&7yV3?f(qGu;4cre?<ezmk*R=B^XhNqE`fH*`~$)IVqKM
zIOKNTZpy`6K4gf)LQMUtF8!>H^#{LSa4x2=FkX?1yB{S+Fsy~I{jR=bQQ7W8;orHa
zAmlBnJNtkq(BCl!zqRr%9~dHtU5IN))5A9vpIAJM7#o0b;m<-$%UUpSaEP*B^fTb*
zUsewpA_yU}V~THDb>WIaln3+Qpw4eOjt#ZKE*=K{lAC?jry*n^Px|pd{t5UWQ;5q#
zjGGEk9sK<Pg;<8vxLz0Y9V$dw6bp_hN>|bQOKQZ~^GRKZZ&3WZt`Pg7_-{}jA6AZV
z-9>p;50pPW3egWML}*dQrJeFKqW}irB%IT3*#U@^rM6uE>|Ps>QIhHmT+rU=yvqAO
zz<YS7hSoikY?kCYr2(iRPZ_F=EXlIF80Z)Lr`>wbvZCuD!}fL`_zN%zr{N@=kPL`a
z=<`8IruX}>(HNYDa~gl&>qdcv{<`P;EFJIH()rVxo}5SltiT++hFAL}sR~N6orWoe
zNmCV=z0$_}kNfCQOVXZEk}p)MZMr1c`+btkGg_T=QW^5PGGw!^pJ?|dB-u`?$Y4x4
znsp`l-oTrElH@T=!3DVN*0#1(Aw91=<(^5xUhNV%qwn{+!@tnR_>cSGe-h5RBx^xA
zu21mVE$!aZ8nYzxjK=qeqNgo&cs$z&{s9$PT+rqn6yRSF|Ge1;{+zyl(T&e<sYL7b
zf%r$e2+oN3`!z)~)EGZii{tKUv|mZe(<(AaWqz;d{TKTbSri@pTarG~dK5eV+&*T=
zxJ1ZFx4|S7_`ZYJYEj#r{s$cMOi2M$wQ$*z#0L~9oOAcz3oY%Y`m8#S+5P3`DdBHI
z+Sj_fU&B*)>*d#WC&aL}hJB^QX_jFut4R7W{0@(IEfguj2`%2cq`k#C^+zpgas3l`
z0Izn^Km;e?27C>dwH(~S)MZy`_}i{a-v{6ld<)mza_-noHkyS==!3_*Cdzx@3|x0h
zyUhY<s>AvjjKS~leAjI6BwW_o+2b0=9xF`|3N^;H^nGnt_;15aA<xXeV1c|>=}OYI
zaoq5K7QTjCS{yhaBQWT|pxB4u5xm_ggM#$A4A+JK9(7+8pst4Qd+<9v+ll`^IIp$4
zmtk73)gAm#;4VDcwa}{or{OwWRh7n&l7tmi`h`l>JG;_P0;jY$`#PL(7UY&JxLLRd
zcXrlE2mGJY_fKj=aGwUIh9;HMY7%(2EB;Swao<I)kI2ahT9aa*fV;bvIYwGL{4HD-
zsg@K43N4P)dVd%Ahv1TyreA;wwJP{J{YP*Y9>DvZGPLOSzF!sn=H<c9!x?xD58?M+
z;Wzrf<<{Qj)H&3E1vm*~aCc{!H82cU;09cVQ?kCy0|+`ONY(Bpnede0EZl-ya8BAg
zcwlAOpeNx0+}pKGIfiMt<oI{M2AT_hFX0(Hg6BK4y#fAPa81i#ECd~fS%oH%!F}lG
z6nq8WXzg>aQ-GW5()<;2zRAba&x{toUex{^t5OO{_GgkSFU^MTD1d$~UH<_t$ac0)
z&)`K4t8OAhB{QX_Ic+Vx4OeA;`xvg3FHVZ9Uv%lQV{je*349~&wV@$x)g6p$GJn*y
zbh#HUXzBGuw|oVtYM}S9f+`I;g&FLhh3~X9eONI_Yq@R0sH<CP!1FHnC*X?K4xfWz
zWhf&Zctk2Oq)nfB_<P*a=_{(wiCm-Ekl!%?t*%n#R7>Ok3EU8pQ_becYry`H8|(kJ
z%MX~+((_v~A4AVZuMj>dsTc_Jq1R!(qMzGv1y0EPx9K<Ia_VzxK+EXh2&Ul%d=Hmp
zewVVd4QHekLgC<xE*&-vH{m~nZ{d^_B?@5;X(^0#<KFi?+$Z?Ig)@S)q{DZW?gXDM
zqr(z74!7XX@D)s`KsC}W45;*MSai7y{MX@ot*!3We66AkpuE=A1+(tH$A948Kf(`i
z8ICFV+7h)K(}ox8BhIwi<lq#11Al~DT6}34;t-7PmsJ!<sGEPs;gUmPzocJV`H)eu
zU~ns+c{_9fuEL+;wl1$<bQWUhqppt|Xm-J$O8)#sYj+E7YGF-5l~G56ue#u$*4pu}
z#lNv@)EZOu*(3}@0uOy0B3N&KfFIyI42k#>mxi2iD+Rmhr=YdV-)ZsafRNvGLw2U+
z)6msu7_5hz@Vzz%k7_-5tSFEbEd{IeY0$x`-Tp0c@1$7I2q;ou@06SWQ1M3Kgx1F1
zf|G7_Rit)}l45fuN##wyw7pLP@LRa3fe+U3-B3<7?{>y+z|#;9jdlBffXh01;&`*i
z33J>uuT2mCRbhX|jmhsy#T|1KU)EsOGvHa7U*}Yz-qP(MJG$rwepYO0dJM<mI^2Tu
zid?iMI%C~i;))*1GHbTszYO2P4Xu5(`ktB^=|^2oGoWCb28ZB0d<);fMYTsX+{B$k
zGYj$xjBmi>fd54J|07(3X~+GRgtcsPLshSv9{<BK|BZV4+~E1P#E>64Q--(x{lI|V
z!5MK$QzH;^ZgNB_ENsF<+u3T;?<$Nr;EOVYiaL3uh}?!30LP^LzlIapX(7l4C}<E$
zB%5y5<NpHOhOgn2HfI~mZFR6tN_&OE>Br$4LHZg0PH9?4Lmy8yu_`BnbWMTj&NC_c
zzX@kVzetA_r4I&VL53@M<>~*7+|TpU(y0AArzsS9m(VWT()1jhhHtbsxL-71(BT6u
z4~dtmJK>*#ujISTK(CwCZH@OuZDfhm*!I@r|04VV-)Wa^$;}6kv<joBJxdMIe>2ek
zjPsqw#iT7aF1oFCM0Ir@l)e0%glk$FAA;XDt)b~gzxTNI<%-AuxYkDB(k|Ox$?4ct
z<4mfd8`y%U&`N^z_!|DGrQyE3-}?81!(w8ztei+kxLuI>aY6khIVZ!(j&8}hrO%yy
zxfy}*l|s*ChjrOy=4HgMz+#XgqwqESXYe%~lgVZyLd*^;Icv78v`Po9A$W2FPD^{)
zkO8y9TW*;1mU5CEBkm;JhCj;rExMT%hyjX@svD}j-wM*}8vFp)V9FUjRh8D}9UuXy
zvJyJ&0RIK}0ltOP&PHg7(TohN1--7a;d(%9FaW-T%P=AMQ#Y`pDFsv#8Gp8k*JE%4
z{vzrdkN|Ayvyt>pqW`p}nt6Z%oP?Xw{$tYEc8if)WsuX-_K;Jn1^CavAK*`LS({|7
z@+EQQFf9qgwgRgg0sfotXSktFv_(g%i6h}~d3EJ<myLcdz#rgyIPZK9Ygvxn^r_zL
z((ZZ?(Wl|I%=ZzuJuY=wx>)b$6;b#U0s#U4e-QnckBFPpXp=)?)l@7R{Jqki-@qy7
zCfZ|^SCkW6)BV>_@b+uD|Hnk(4ap2EIkhP)4c%WQ$e)|=C-@3R1*g>rTNO}KjMP?5
zNC*BtxB@>&e)YOLi`>lGoEunM75{YLKMCK$_X>a5v>QWP<Oc8r=Xg=W@wS)VUzhfY
z-OSxo0zBs=(xxvc8I<(CDgH^E*h$^ECTn6Y>-@D~0GxsE;5+#)7XD~D11NUzH%<P;
zv6c>>acgh8gmzwYxCXW1{S+MYOT$m9yK&I%EyvlhZH@mQRHE>tEln=LxDvMmZbVK+
zMyTN}@W2opS2r%eh`I~ML}+>EfkdK&g4TnFEZ4N{3=Y2SsF*X(4vLOMgKZJeD4c}j
zwlqBC4lk-fbWl`QP!)uq-q8!=>L%=*e@uzPX(b)(_M2{ibLu{RUaF#l|ByRAF6uMn
zdj%e8Tku)93}^h(@F_Jo56X5krZGFbrI}vCFfBu)ga6ZN&FmNHHC)enTQVpV7@g2x
zF71SWFl+nNCCMwQgzbgna1l=1()eCAc%M+^R~O$~wR}*f!%CEm2kID8clI&O%TFA~
z<ef3p7P2fx)uEq|dpxd5X^Ep*J3kueHVqd%{C#RrJ|W^U>ECkezWWpf3@kWf?~lT1
z8IERS$If3y-cX2Tolr17s0s$*gjjOWY5aP2fK!kHgIYR%Tw#h%gYzZ0pwxv&Sn8N6
zFBUwjEIkoSxYp*p6S^%os3-UleQE_h2UA`0=PXR<{3iNs=E47;xc}ObE6cV8u(4gZ
z^xo0?DtfY9cb^Lh5FkMxq9?rw5CjQ&kp4j5q!-<Dx;(4slRlZ=o5*n6=wZc}F?K{c
z^2h`VjV=~<#M*OB??y`HF_^!sj{cby`Wf4L;)4VX0>((fHvLC{8DLhDET%jQ{NEx~
zzyNTbnxfC{?+Www{fthlvUbJF1wO~fX1NCdVB69RtJW3iK`w|79<f30ysQF%U*ywc
z4EtKFTEu)4pFO=rF2x+9p=>kIG#!6WU<)h?&SY(6NRZ`O1L!CdbbFM^m!{l%CcCOr
z(qM?Aq>(%SG%zfNSFN5;e(=ZxBLq2bU%((R&bhiwREMRoV%b|82<UsiDWFNtR##*=
zD`X=?S)evDvVoh13?pDv^4pdoHvvv17gHfZ`y8u8?fmjIFePX=eh-db$Sdpz#^m=r
zcm65P_M7`?`M*H?(2P2C)MnuwJ*OnOZS0!Jx|)>IGr&%-4g(MHmB#2h)~WRC4E5`z
zT#0>_mXAjt`vxodC6_h|f_xBJq|F}!icWf_q-3T>HxLZBnDJhP)0lmx0LNJM)4Xva
z`}&P29u@wa@qZutF3b7Bi#q?Y6lh6&2*+<xiS@BjSv3k#U6GI=0Q!M(V3uuWZT_ET
z-)u!z%V)ANs3qGNOT?(|7=USZ-k1S2F3N-`9u?S}wMiAp(p_~gPgeee%B)QOxzNQw
z6EYfYo^~oegq`1t8I!=E<DUxi`_{H*lyArtLR^ADc3Ff0Fi83r#pyDc(;y<CvN&h*
z8&H2vQhtr>by*^(*Wqz5aK9OV0eSgAq`>D8L{sEvNaZ~&(6#TEs*ffREtziNNGP#X
z4m2EjY}7=a%<^`u!d((YpjKgxZ9rHDg85>BU?4o`yrE(dtr2ohEPG#lslb^fNZHpi
z$RBU}72|ANw;(kWxlO43J(lA7WvgylV!Nsdn&sUixza?Gd5OK!r6Lacr8<_0(D*#R
z<zg~^#gKuvhdfW2^w;D;i3&K9{7(fK;-EnRXq1`UwaitPx42f&vZPiY@2^lqeTXId
zgekp{{=P|aKrRSbkex6ADCM_|$NVRdMY171W4l*V?{S5WPawgXX`GlcoJllO;1w1M
zsP{4L1_@kzX3~`wszx2X+A=*T^f&8i2Gruvn6g=sD{oYoS&s73Z{AlTUV#(%5gRm!
z&??H<n#qr4K7HA~9I>oF!h&#HAr4a>EbWqs2W~3OBbg$mQ$t7pVJgI;IA9IQ+(CS+
zG6zjYfI@^uMf&v<%Ac9BK(vs@=SP_7Xlo*ij{4;M9#u$#`^Za_mtoSs05nRR9U5fn
zzr`1h1VAm5yL9j02Na5k2RL<5&bynQtPM61Dg!NG&{)Q~bpy;4_+q_`;e0_-%w#FE
zlh>qZ%uKSSva+!*WtfoS+{an3LTGB;lJ@2=rSfH-gtq2Q2M~J=*$3eZxilk@3Urs<
zZ9B7B5e5$Pk+V8+plmvtMWL)H24yhgONWdpAZoyhyAgQHee4sc$fDvbuM8nY#ERe;
zq=8KA#u7q<>Qb#A_(o1_!yxZ68L1L6W|6pM>d=Rf&DL77vM98`bw!a-*B8&$@A`^Z
zcC4pVI%ZFle(n$bfU4M&72gH%Jw5(^bI3}fusXq3B!`Tt^gRiMY;r#lC<qU<22q}Q
zjS(`kgZ&b8A-m6CwyJTRT?-kB+7wV&_w829=@dBC0ZPSn@#U!I2Kag-T3pd=rzKA;
z7i5;SqCQ6^BBha9CILmAiYerech0Z(7mN1Q2*8sC;ugxgSBR}aK>5U656y*{{91hK
z+ru7F){3HG?2S^<>c)Z{=8Z_gGB3<yttu9TUtcgy%N8FNSRbqaW$_pwFRa(836apk
zd~;yzz=<sX^<`~{dB|axFb3ki+LM|1jh^sDsQhnp$CmI*sgcZ|=`tzpG*M_~KFRY<
zAe__^)=j#6qD0(;Sg?|<{AEY7H*qaM`(5+(mbweX^g!4^`+ADD2_=;bjoGahpVS0!
zES9m8lrrx=%mb!+@|VmQ`2s%MQS32Qgqt_F3VJH`&=(73VJ?8osM6#q0=ce0)IgF)
zP*mrtbFF5UEay??Z+i0M8GYS031Ycq<1)+Qh2lFSpBHO`wkIN-vFLrDl9<XuccXe|
zq%{(B!Bj#<`{gbhWVGlg2jnT7idIyn4y7#IroVAsOdIC2TO^U~=g9kaZE|ROQ^P0i
zAb^}grMmr>Gr2q~U_?@VP`{}l)JcKNRCL^P5F<g0PO#?k+6h|YQ`cB#ZEtF~tO6Y5
zPu`|~EJ+r7ktELx5L%?7l@qlI)}<K9V_DvAdQuo_L5^Xn)0zbW>JkeJ6H#{E^f$_x
zc>8?2nEVrpbKhY#Pvq1*QOicOq=Fr=f2eB9P3$JjEj7()syoS>_b!vz`4|aIg@tlv
zQxnFBDUgOOjtL(C4HiLyJoiOXlQT{3*sil@BBlTZqJTW1J0z=q2<#E3yj`iWP&1(F
zwu?=Ko|aT_JP<dzrs9y@i@sEVyI$HHhZc)Nou&3J*IJP5uUbFN;ilYf)HYxyJypt3
zM5ayhYQRK2<5$EjJ{il~dg2pA(iD|Ydo~D8(3ZZsP&g14F}}_6O1de4d>ycWLzXK-
zvBas^LzBc7JK0)B|9bxS^_`Vt6ZyTG9nu%4CK18XN`^&AEfT-({<1NX`2d;xa#!Hv
zlUd#KEo;lG0)5gtYR0T7_|JAPmRN1W5o`i`K$BSj%l>+zTQYjNF>wG~gI0(CGLk@f
zMi;@D_T2?`9k`&K#Qi-pacSP;3FP^N%-24Dl}L#VHrZq}H0FJDHAK{ue`+k$90L6U
zRBql^C<f4DUazYBe+XoBuB-_Ic|&6?Y(>f@$P9-R0rj{72`CbBRFmiJ(BCcdJYZkF
zY5z=oTXka-K62>zrz#KI$66^;G0#3Pzr+6~|8=sLIS?_GtfsSD`Gx*T*ZIxAGZ8HC
z{H8ye<i7);71lYXya9|$)xwg2;2&BYC8h^och^!UH4+I=OrFiTHRm+uaP)zD+bRb&
z_XRbvivi3CXR=yn$AQjo5;1#_nXsau-dRMKNFVXOuHHRhPu@Naps9P+B)%Me1K6-Z
zF4vQ}0?e`m?~x5n!<i2CI9Tl3xM~d!Dpw_YtR~wR?Hkj2ZUNh_2R5bx+L3yp+C5`t
z3w&zq!QFQke8^QdJM2{VMctavf((>$Fmd0}e~U|68WIAV#`on!DWStw6m-63(j=Y)
z#N2BlG2vjVk*p0OzY6SgNXH6Ap4^<(#O%o1cK#(`omrJI?>!(dPYHGYt`p#lQ>H6i
ze__e^P!xdOLW7$*7Huz4l_2i>OrzTiw<uleSk-|=KHsoS13SPj;7c(<eb)+Eu*uax
zJC6Sou9`VG%FXoh5|L7YnY<?8s-zWQlYJpR)rOb18}0`hMDREOnsu(3*=84xC%${v
zCm{sNhOKT+MIN{FeUE{}nC6xz<i8c-)VJSfi}}k?elV5)V^z_Hw9(exnu=9m3z!x@
zwNz1oPg?k1SJ3#FtGr!LwS}gEK+jwC%GhDk<R7x>WSbZfo@g8MWG#cUFMq!troaZ5
z{EV|YrmUOfJE2lehOe6fbqu`>Tx1&E-X3(zEHC6Pd{IHe4}oRi18|OAdL`i-(EM2t
zvP0bRHK36aOK%C7U=w;I+FvCh0S>s6Vy)}^3&4eZCf1aO7D1~7PgzvZbGIsHg4J33
ziiB>GrNlG7?*gy^q$#isECXxY^kD4T<l#$nT%GM-=>omq1{Q!t;1Z{3GJ5@4k3Fjb
zOWa7btqiot0Yvk0&1kfk+H;Dn0c;RqJIgqx{}~v{2Y$W;1Q~hZ09XM&DQNf(LFZ@4
zx)ua!|A<HJy$LJ<D+(IEOeBm+Nk(gEuh+3nknDA!l@d*DiSQyC>IeD!Rg1s>nF#gm
z-(LYfbBM%}M^?4f*tJRYt9OPvc)%;U$mKbv{aW<z8ZrUn=>5s@kJ0ZlCkV>I{HBaT
z!Ejp#J}dw40c$kKBhs%A$p}J{MZ1*u9g|<jlINLXFAE7Zc1}1%@fr9CYyoLPB&kop
zI9+y-`1<=EU60R%ZRzmOC*YlHD`h9OWqvHR*`oDYB>!(2|E~ZGz&abVEMzrL#M~ff
zK-P@^Q}V+i4M4=x8ZVXUK*w|doWtEd{}%9pYi2^8-@ZuG7bz!j7F7Qn0&7%hv+NDB
zI(Apus7`(F_&*82CXqh&0`;+@qb2Fxgu%)>VMAXU12lQ=2{wmXwK<c>BO~bfGvIgN
zDX{F=BVqhk5!`a*YcH0@-v!<PzY;dH9R)_xoNCqXOnx~g>wW;95cYN#q*cgX4s!y%
zyxnugtOl$AFM(fyr?w3v<9uDev{%?3i-y?14A=wS0>2YDE_Ux(A>4a`5lkpspe^<3
zRLxW1-@pq~fK7Imj&cqokY?IU=+^*N37q-Vwo;|+t#J2Wkwwao>EV>_=V#y#;8(+4
zPnf_hOK*;4GJQT@qw0D})WHL#e~CS`paHWUPD=(ifHlItcKFAAD$<M@9uveI5;XjG
z!uGb$U!>}_CVXq9h7?Kaz#3trUmE?5e}>qO6tbG!%Cm4CSOuPwe^wwTERgutif&ak
zEB|J^!e_t}+vpUtfHfrTJX#Gl*Uj<ovo6%crt|_0lK&y6u*ks8PrU~IAn;}i@lngv
zA0@HD7Pw}#zVn2A{-Ea{An^*4Ke6+BWWFT=AHFsP*rd@JA`6@FCGtfsa9iX?q!&tm
zkG+BRa7O)So{VGUUEn=8l6|x+cqHeKL~|ftnU8@j^6x8^Ur*YBO#Unz<4f9jlh40U
z{o}EZWrXburn#+$G^74}N7(R>u7clYG*9~RqI~MACHthF6E^gd1xTs9Ls42$HG9{j
zqxOujyB{=dR+mOAEB31GYI6GcE$}PwU%+#-;0Y~tiPZD+5d?q+m$Cgt?YC&!BNGub
z?d#=bD5azGn%eul!hSYbxa(s}W`SK<sVI;=;2rQA@N<^|HAW5cP%YLl5!2CmDxle&
z^S2nSm&kxEHZf<+<vB;;HKp%;7yYf(%Q~_k`Q#R_aDm9DPi?KLmAOLN@UobJm`>~_
zVV8dc{#MxDG>=E4!D}caJmfyDUw~h`_F-A}*2<TecpnLU-%Vg1_>Bg1+s>bm@`e0A
zkcRvkunN2Yej(_3Cr4ogm`FNytx_2?c3A^n5VZV*?gFZj{v~M!g~UGYy_^Dn0sH~H
zwDVi<MNzV##u`TBDz*QAb)7#IwC+N_+o~b0=yv{J3HyC$=Rc(UT8Ed_t(uaa-vQo|
z{v88k-9k1ok9+|m<xbNT(*Lh6*yo1W&;gl6%p{lXeBal=6X5SI;Kn*LB%u@#Gikd^
z$v^Lj9Q{i9$LXH}8<C6!RDU@Uct!o$p?}6IlgH&xksKkD%$wt1NB@sq=LgMTnNFMa
zQzx>)W`W4ZZ3j583$#YJkVmSrQg*;y$ba+n@4B|pHDq$4C3XH-tjA~IIq)0s8}O!U
z8)iz->!u7$nb&_HxY>?-o^>rGJ(OQ=iAs%G_1OU45%m0T;8WLD^`_L3kf(c%kcl6G
z=fI!bM!JLirzZ7cPa+_T;@9N-PrH^uRaplzxsQbkE^K~F$NP^i?P2t{cDGolXkq&T
zrT5p~^QZFrzQq3tNw^QJ0q>~&pL8uWJR}cW-ZP}o?E%{a{QV2~75J-n8)<_T^o9%&
zThjc1n8z=HUzB_8lkxr<Qn<CtKVaT>9{Bx8{;A5$52O^?<kpG}u4De0)<0MDYw_^f
zd6W=C{D7FR&uM*MckzFg?}JE0dt{XU4d6ZSH}Gqh0R`c{EnBPipRn%o33vwlo4$8b
zf4{-|ZwdU7BVLsy-bFs|!k?`!{;%dyTkHsjtTL|>cm5X*pOt@N@?w*3)a1^bHQ+<u
zd(54`N%^yQ!Rl!(R$bSq!u|r@cWrb|ttg#;sf^Sy{`*71&)mz`c_dHtUz4hTjr!wf
z#?D9Zk9E^DCE;Bso`#EE-1GBY8}Tz9A(n462c#+d^C$4D@_%Swu*e{5gZrSYA^ea@
z*7pRBuA%3%#scXoa08fOI9j0Z@Bx?yUIOorM9!?ysor8fdXAd`YRvn-2VMj3k6h>M
z;?V=(BXAKo4@_~Er%vgA!F}qj-siW07c5bH;zr;iOWs=oI{op;mG&**BMa24T<KjQ
zQMYMBUmc0No=_*c0?k!moUy#i9<BrPz*FEAuz2Jy-gV$_;=5ntplCmz-{C&>m(2ck
zf9gz09dP;YsDO6w^8BxX*B1QiXO$`LZD18R5A;jb)8!ps0rSW0V|TH!D^Onn&U3Rw
zChL6P0xy9#N8*kj0`pAKf8_InELyljeE?np^S$)XfX_feMRA485+VSvKv2JyIM;~h
z@VaZCx_$P5dD_Yi-~#b9tY>K*cmuoy-u3Qs2A6xqz&3D|=N|?Fse8O9q{DJA{a2*K
zaEY+Dt{_<BF7=ndOW@;?^1B5*r>HymnP57hMgukvyaGNP39Hf~|Lg-x+^0Xt^Y3yW
zy`%qf*TWe-;saojQ>reV++vr*(#iDS0-jOT+?17+8Cymi|GWU+9(jI8e^=sM%tLZC
zn0J2z%mW{QwXOzCaKm<$FF(Z{0D;sE-xBn_dL;k6=J~JYXPpn&=wjzTnt$H0C*&hA
z&5WjN%cydl`fG|{`{Q+e%_uNKym|}doCRAs3uttOW9k!}6^&?qmVhO#eH{7mxLIII
zYPt=!ZL9!mCxhw?5cGM9n+;N~Ub5dm>HKal$TVM}NR_xn9k6-w`F(Dtu+<zsgReDU
z30OTD)MyBp=H}(WJYxD1@Yxk^PNn}emmBoc0B+0gS5J7}=ca%eK@>_E0j#ij;FR;9
z1EvI#s!Gsy*C}$+`OD;=X)bq2*ygg%D>&)=0jIjBS(6Whe^#}2eB@^kn4$=bxj}V4
zynYtYdUyVF+^i5u)8|H4@5FKEALIE|fo>5|YyDLInIiL#a9M?`@-MO3>7?`9W{GJo
z+wkdtxx(+p$)Ii?5y_?m6w5ky9RKW`@canZzK?P_N0U7|>%iuz^mqI}$K?`5%Kr}e
z=M?@AfpN<3IM))_*eU6HHTF+<eu(EEV@c5i4&>!GPd<N<n@Og*o50KKpVd?OXPm?z
zW;!sVW4O#_zubi#@g04#&@*L#SUp*F>Zi<p&a$+g9h@B&)aMaOT)$|of@k_v`1=Ja
zKo^L%&iH(p!_|>=Egzm8Ry5$Gc$qrTask<?4|{yL!t7&HdY!+3{uTCQHyF#Gm3|=$
zl96=1pOpoCDFym@{)x}<+wJ@7mwv_?d&STC{-Mx6pbe_B-TX}SAK>rX<WBWu)75)^
z%jY*31e}%syZER1Mfc~)ps*`w)_MN3@^3`?m!)s>;Oz7dNCI2&U{f-u^N*dsUjU`5
ztTUYHQzrcjUq}C_M}7}^iqrWA)O=-D<Lji~S@}Q8i;&6B7w}Jo@^87=gCphFYdg7T
zC2iNwg!%F2FZg)UuF^QWY3cfXOL>0H`K_-pIlF1lhqc=GmGp}w-~YuQ>lFLS_qQ^|
z7hj&8Km70fzP^8aM$ytwB%Q`z^9pC6zjxBHtTvl`9sd_(g~-?Ne<<|-n)lZuG{1uW
zq0s+p?mra$@s<1^e6{@hvL*a0=^qLGzvldb$nSrPf4=7O(3am2_nD@3!!4)3Ng&HO
ztx=*%kcR51&kwL$-sQNP{Bifcr{6@0yOb)@@v_h9y3?H;*K*cRBOftwxog8cCx80b
zZ#T$(Syv?9J^T6l*)?V!>K?FT(}8C>!mz;W)EOb|pT$Wm(LEdB`K``(rXMeI`Jml@
zgLSsuv(Voc7U*M&;D7)Hr=P#Ve!l_csO{n1vuFb2pMLrv4Gu%?bvwC7$yLD1>les9
z7vAlih5!43VV*yfxlG5ur=LHP=beY&WR~nq^6MhBLD@hSvrfKymh+eSaKCuCyJta6
z*YgicyHd<6Jd^zS++EnunG56pgR`99wOkK#;K|(k-81q}nfyN}@IIzLPUqhM800QC
zlix#r#lfl1SES#y!0>jDD)e+qVDj4!ln~;7%TYKtoU{aNp71q;z&RG_#+hQMa};(7
zSOiv%W5W=b0WJaOSfnbkJ8Yf97>g(5*gPg!&k?k~%p9dF@+~q=e(d>&`209m0+!fb
zyTk#@wG+-i1zco>W<)@<-Eh+auzX~N$$fn8VV!20aDY6x1uOucc>ZI>sLb8v?g;&4
zAj>f?oJ{{oW;e&U(cUVbZa8oA1R3<CKrq1^smCJW3eUfB0{w@%3VfRTEDL0TWqJRj
z%}4-D0q249%+dGBWX@vOeBN>Va~`-LIbx5U2lo4GC!hZunSWS@z7E+Z@bLutN5cQp
z?6auTfmj4S081z5_(y;_;3Bt4TZ+{c**^ixM^?z~GYHH8=eW(gp9Wx;^8fK<{+}ck
z!L-1JyYAGFL`*;B{IkGC!c&_AxhEoYlKl4b`SV=BoUqqyi}QG^C!Bwp@;gh+XlwHL
z47>*xPtd^uS2?=^-#DS$YV0yw06qXKCzuiU{^x)>7I`yzPwS-r@^R<4-#^bLj{+OS
z-DKKlVC97KoBW<5|5xDQ|0x3a4tNf{JQ6e-z&LOdxDVVVc!XWxHU;P<@a#y?<q~in
zxCPvkE|-#YXnzJ?0)GSZN7fmAU=H{e_!hXz3R#H?XpfB!&w%Ggf<}YDf%gc!Xmb<p
z{7b+`;05p;_}H88ECT0&ZwWf@DvWOO-F4W10_K5d!0RKsltSPl@GWo~xJU(3W|?7|
z#Ct*d$48$35|L*va-+D%8i}(Z&w-~$f~J>%i@+_m2#piLp+y7m0eA<z13n%}{}7l1
zZUDD{b1c#gvqrNAECa9EsI+wC`KN%Jz_-99{{9H->`no^16}|xjs(4qfD6P8zrrqu
zl5}c1{&@<#IgWp3ftvz)KOo)SuJ!2+@HdA<j^_Vcz(e2`Fd^^*VE2yWzh}V5URSV4
z(Dge6Um5{?wokh@-Iu^$!0Y35_%v_>xXmt?K}o@Eu*+ngT{cIYe*(Axe9Py@*sq(g
zNcaYL3Ools_3}>{xB%P$ZUR?;8ChG=0M>zzz#HHh@X97cy68VcFs5rfzsEj*_Z4q}
zrv$A(!Ul{0SAaXf9Zp^QY;&xN2t5Nn_4*J+-~w=m@;|{AH_!oi3;Yc{1Ku53rw)J%
z)IS&L!1ePAU8klSvHIJMSe?NCSE<AMWtWb7|7U#u2>Cq++yK5I@{Z$w*W$OqHhI_O
zcQpOy`1@CA0E-B5jXL-udmmj~ve5MfqXb_25x516<fpwGz$G$99}oa<^&N`9W#Arg
z7r4XQ*OA{E8_(Pr$g5sIdX}K&_jr0M5;j?9xk$Gv1wMB@KLp$)aN-5YLKW!2T>vf;
zmm_0ORp<Pd@~C$u*|D9lb!1vV!*eq16Snmy0p2W#0;$uX=x1T%bC(Q^0N)U}@(R0z
zN_>CUJbYDvw%)1?IQl;V9s}2DJ^R^Wvd75a9IZ;X3<tm*a2I$Cd_&OlLFxckWV%39
zB^MVjbm<^>{{H|T5_+pFII&IO0x(DZOS<SE5p(uC;I2S_fpZphA__U5_C*){Jzx&F
z13U!o5Dlr6UlMYjjmIvry4b}(!-D&LgG+}J&M<kvJ}@O~@n5SA2nn0~BVm(=xTxEv
ztgLZ0>_V(}pGn{r@ICN|u+x!riw`omGQ-r(KPJETfy-o7UslfTl7xML-_@#K63pKl
z{Cj0tEIS8W5R|`Gul&vd_xb!x5BQPK#_c|8l%whYJ-=d-%pY?_&^~aEty>=OUj67$
zV8ibLa}*LQV;vAt#wnFoUG(?KKR*!oaVQU3-(X~Cy}zHj&OZU%CUEC0Fv%__@cwrh
z7M7_$J3deexB}b*9soCibK*U^{MjrpE3JReyXGuRe|}Hc<ni`BhZy9#nSK%=WyeLQ
zTf7gr0z4${@hpRnCdn8HNJvDR{>FIl9JSvqfu8bYZmPehX=Yvx$T)Bvcno|8To=%J
zpF<Y|%&HY-)#rv1&?&$lh#OyImqf_FP?24Ru47<9wZ}u?pX8pBtlIQA2p+QO=#U)~
z9R)B3+yEW|_kk;deiX~z){Nf>s<aOME5KFYd*DalCNM26Ix)AB`b<hBJpcRN+rLXJ
z$_jro-~dfO!=i$0o$7G!4S`J_;GRL-c~O#G5FSnPYA^kr_b@1H>H<!{THiv#WNt?X
zoG1P7Q_Do;pCX%G2gyYNMY59*o}~2MCFuN+EV*!$A0YF#IMvgke;K$;*w{O?vc8Cb
z$Cn=<b?Odu+dV%8d;>fN?$P_G$ZErw?{6`2l|w1*&j@fGcti)p8k`#(0v=`u?kHWF
zrqbWZuX6yd(t9ZI<beP*M1soJ%|AZ^x9R)$=a<ylE)Ac_?7A9|QQ!vfJ@A;?*ONuC
zZUTQuI{pqB({}jh67ZO?&l9rQ8Wf~5t-r^<wU2uKJHYqA1K<kzuSqTr$$4d3&nAab
zI{g2wfVNM_{W&kik!_F}p*{A#cj)i<=X>BX3n*3tNhmyhqJkR0TPG$Sa1MAt*yK67
zZlSFDwOuR&q5?a)&~e}f@R;6je+N${1O$o6ZBaOo?U;3A;9K<FhXgC!2oQ3Mz%+fz
zb;E5Qq4s`6B;9fzeaz!KL`;e8a&)Op+*u}1F41;*vJpSzGNB@Cg4?V~HS7W{c%K%p
zGZSEr0>S#6HQr`f6T`q&;1Tc$xXiqBL-zYPP3F=;7rE+h%NWp=4z4otEamr2wPXdv
z6j{Q(LWlk?`S6(GLnXgrH{iC_5-`iQ<2A!w9|SG}w}D#>J8fTDLbAJ1W{@fQZD7+3
zKna)uz5%|Wk5iD#kN7$f!@?G`lDl^PNhaT}0Au1fr5yP!%FtPrzu&Hj4bghsXXGDI
zfmkmV1oXbihVl2Ny)OXwfJeZU{KC$F?n*{8lnVcJdJ0_f^8xS;!#~?n<Z<0{iJG*)
zrB|zt|NEKzbEdmbw!Z7gxD)0ZOvkQU5bKhsw}Jb>bs}G<`Mu#0FQq~e*i`yE`MoXl
zgRBZQF0KG`K*o~Nw)2CBfJ?w#TF*XN^_$AVVE2dDWVP&Plizc|x4<pnJTqS@V<+2o
zk^%$33~)oXjkZjF9sO?svs7^{GEYR!70J?|9cmE_0+)gNz#ZTmYrm;18}tRX+ve^z
z%mFL`bHF#i9pECpH(%EAn-60*<&<AX|BM0GfNwZo7D@POj8JeGu>hD+U03;a=f6TL
zlgPdTpI_%Qqw4ChZ39qb^79r;C$?EALV*7>>E?-6s&f9l>e%8YgO@2Ti?dxnnY=?R
zxJj-ivjyM+@B{D@Bj-{cdd!?ayUBoA_4d%m7|riD#I4uvw@k#q6l;wO4iB9IxCZ<H
zJOs|MaNCdtM7ll3&sSEq0n7o9fqyV|2<f<Gd<Bq`Bhs>_LqkJ!|8KBxRmeAxWi+5A
z=}XzLjmbseD)1BV18|9vr!NWYtmplk9R1pM?$3STJBdAlo{h(mtPyg9gO2_!XTWa(
zkAQE12`ZWvt%oUKTjpZ-w`Tlv4fqLo$c3M=FtEqV5A%>l+q5B=0v<B<8xf?6L>4#&
zvc5iHO}agFGXdNK{sG)!<{%UUP>|P#Y;EiCkISDq_w~F){%L+;hnHX4v&tpMm;oLF
zKLXcy|DpJ<c8jLWGGLb*hCBRkv3DPDxWxm9!c?Zi*1PEcnC`n}PA&X|z>5@U5_dWG
zh&d+TEcUhr%p?!a7RbWebvyqs&wrgoAM?004$s64#=0@t<v+fs{PyPo+)M!$D1R=m
zvO^ML8h8MFFY@Q*7i|T3W025+ZJ*yIFTVqB1JjffyEadjpBM|ZfNk3t@6P`ixDU*U
z0fpF*RPGWi>$e?<Y2p7-*;sA|hDcGsntq4>UFq`?W2Yj$_a>0Q&n*qkXfei_Ck6-_
z?mBn+rEy+q7|9Oh3NUVYzK96olN23KL?@L^)fqd~T&KjefgTSr#W*fWvzF{BO=*Y0
zCcO#H(E;=;&NDnl{`^5+jhBHD;2eLi7!t#FOwR8KBu*+1mw;jR=uCEjZclQw*B7H_
zY9L?)Q|4E=JY(B~on+fbK}0tt14iPL*!#LRi!dbU{3K(qQvMV^7qOO^h!`bkxKlM9
z%;Z7lCrx#SG}?uH95BZ`ciXR=WPQMzKI;;-L?TQjU<^3VSTFQB9m=F-&Tb%od5(%h
z9EB_!{b$(r&?lLTRJ;e9c+B|sXN>>t{KEp0r1$6Ny<v9**W(m~zyxrGh5u5%^b?3M
zXY>R{Sd6I{|IV`N-!JfszPx`*W$5^IhV=x1)4fwnItP6JMD`B(+{o&hWM`S_D7o`Y
zbJM&zzFCj!m2xTWB=h+l`JLv!tFLD%4Ab5&jv8m1&#=*dnwh%-f7W=m!0#$C3p7l9
zH3k#|o_~_DpWhQc>0LVhJ>R8&W`H^7*GqZf@noPRVLHy$Lq^>9mI*pPBHLz-Y(bAM
zC{hGRfiU29hDq|Xg<SSeK}?#PA)92Hs;$2#Wwaf#<*b1)=lfVVS>aO407%Mgyd7fU
z$V4ZRMU;_1kM}blTPUE_Cqr!zrEk8seVxx-&Jgq2?XLX^w#`)X&U%;oVw$v{c~&1t
z`+)I$<_Oe*<$N>)<^$b0`v_1hqA|#;OPDtVQy77S!!{Q;Ad9KnHf@}nMM6<sSw4J#
z@b)lkQXvSsJuG=&FRuU~p@4g}A?CS@CB#F7X$Yxh1Crer-@)gPyYst8+6H}4lAfmY
zB5|MX7D8e8_W28v@r?3;lciviybT!TE`o9y`_2Fii}i-wSwi5!OeXq@<lpvC^B{je
z5M7z_<dG~)Ol44^&-j1XK-9T8H=rjJlBq1C=#s(C0F+9o4zp*hAPUfzEQ%RZN0$n>
zidTTGfc!H_7AVNITNWgy_5e##BjmTZ3=Glw1yY*#gl8c!Wl?bAqY~<a6s9P@pW74D
z)sl^<6{VNQ%jlO?vB(cUg+dXR#T{iqKLp}F^3xE{pNaJ_A1ELdO9Xiz28ty#OqdF?
zCN35Q?BzX4=L3#A{&)N{#<s;MuWCU02l9L&KPc63_c<mk=;dkdi}z;t=j69k!od(r
z^B!Fk%>Y;;$TFfsqAa(3862nxtI&*uon*34l~`t6=5$*UiGDZtDWfnp=tLSn`+~|9
zga*GZd}ntM$|R*_0#k7=+qZ1?-m;&Ny}?Nyyg?0E1T-QpMkr9k?7#)F-KvixdP8Bv
zv56w3JQCk6l=bQ6Q$?~Bw(ZpPv%nXSY%N){pGq%^sS)$l<01>>8aAfFI$as5j5YZR
z5QX&X%lTj@llb4XlE=oferEdGC$Y4jslHV30=>Kk6Ur`-(5)OdbLI0QQnCJ?ES-)8
zT%inv1$rOlo@_}X!kAJGHL0Q7dDf75hJwT{rm?h=r0J+g%A$#-$INb~A~3#0gFcym
zfcOr@0M$Z1YO<s=<ef7Kj!T?TEP$M7&MwI3T~2r;ZiF$fJAYgv3ztQKTJ1F;=LURU
zol8H=ulX-Xq=S4W+`N*a^1sgju|TObbDIhM&2<a-3Kk;7McVKRYn3hWRgLZJL;+c|
z00f16g9ylB0so$QrO01?TYd|o56$Qn#FJ7VI;3rhEIV4@ZjmzC)=rz|i%I4cl_>u~
zNV{oiJLH+q@~3Kv`c2;j!aotUg(Td=jvcXMg!vkKj>O8tK9!kibqJ<mJ3!Lh62#Lr
zuP+qs<;#C+3uY>s75rt>fm(xz^SJ=O>lFggRhBW<;LC0zmObdH(N|CIGm~h+UrW~S
zcTZ-k%V+)~WLdXmKNDF78w*!<zQUS>nshb_LX<@IfVYWi#0$v-f!LZfpH9&K?HoR!
zOc!X`HR&HvX+-2_bJ077!IN)|#FbC-h;y;*!%gi;D+`#*XiTE)kjP`c{<7V$Wez<l
zLN~<?h$J!IIe#YiVWu<5b3`U@Vctsw_!+y3;`}XoJ>P3&3e-!~%z8^+%nX>-ogl}~
zx|k=DEVZB49>}XcP<~Urf>6K`l*smQn0Mw{_Fk%xLwWO{=O9tuJY{-*mFJkWEFkB;
zLozyM;ZsivJz-v!?4K^B=H*pR+e+6Ndt1+4BquT-BBNK_5WSY$nv+PdXM{AN%;f$%
z+<xS-P0wj5+Gl2>^L!xQi#^7QPuRAG#*Ymyv2NOm?iODt7Ika-ETG&Q(QR>86G#$S
z6Rt6f(KgG2k7VqDsu^5Ui?{@);!~*rn^ue&g|=j&cnp47gWAWan#k>2O)KN#tv!du
z8ndCZMKxqvr6gplaUnV6Uz?C!ZmqnQ$VBd2dD8{rR=0n(No!=pci$M~;SZZsOij+E
z>;cUddn#i-kn%4bQp)@CNvVXLcrA!Dd*VuKK&R4^Nqzm$=${BGMUy_c34j%F>?>l9
z_S8Vo@G3D=0uszJ-ev>ri$eaL24ov^gBsI3pe{>dmDxmg;>X}&4)b{ybwEr!)uC*=
z1TjawlykiN%XcL*n|$Ra#2cwI{}>7hBH7p1<jXfHg6@?&+DjehE(2<I2kfG8A{@5_
zY`~0zFKn0adH3dZ&L!=*Hp*1=i8*tz#0#F#%o>wZ0k`PTqb}0o<#mRQ-UiekRk!}J
z#WfLizQ32ZKD~CcLD2X*sLytQ9iU2I&!jb_o=l}BvngvqnkK(Z`Wot%sv1v(9TVf9
zmMpT=Op89O$4N=Aw#qY3Y>H*DVt48cfCaK9qzBw<kw8Gi57>jG)GOpYqXu`{uQ{CZ
zrfecq?Qcd$t(nK6qCMNolFFW$Dx=t#;0?ayMe3AL{rm0nZ*a$f&1qQz82M^WQ$WLa
z9q98mum#llHxgkzlTfuy)<RQ*tP0u$4n<>w2FRqxnsE=goa4aGzsprMAt_R0L>cj0
zJW(K9>;kAY@|S7#!lq=K65;-sZMg1{_LQ#mgoZ+J3*@yWH5QE7eRm^u*>hskvlc{{
zGoXs~&_K6qT!p(w%vDb`c2PPq%n55z0Bhi$r%lI<6j7fYo0;gnMEHN7IO#Fi*S*hd
zr)66bv5kS4goGs6a{O-&kvdRa1t7C~v0Ppie3@<bbe8zbHiDLl@Mi46buBC%{@Lcg
zTW-aZ)5en8$+*_S@qeQs#5m;rMRMl}P^VSN_+x9fH$CAt@GZ)3O49|&<yj&RgaFzz
z`8{M?=B~t8&P%cO2V=gJ6D&?%8YawJTwWCN8DFl$%QMer7wG&Z=~2il3{YFf9E*X_
zZ%^sJN&YWM9PY_oq}0|;u6Ei7nhoCP9x#*7lWXSN<F_1g_4<a<f1mHO&kVI`+t^8C
znpcU3>?3dYkye3KU;$WlPO+q5%(z+F!L0>YB5X?B;+lnZZX@-j=}{Bt8AozAyB3e{
zv#)iXV%W+%)Ukvp4X%~wEG60Js*-gY>rA{ZYp026dRBo?z$dr6z9kKniG*h=#u>A^
zu18^$OI_N1^n2Xpzb#$=VLmP^P?`DcVO_QJ*9fMu!0zHiI%du8nrBku?*1JhOR0_A
zlT9g~+u;-XF^7Er1zjq#CqN9=dDWt4(FykGdu?&4%eFgFjjMV#7*|+$-4$@hv=*zt
zXJFGsVC!6RvrkXU)R>uJ^MDS3CEydVW`jJtToSb-wU&(9qbQUI72rB;PTQ$90ZzpA
zN$6Q8`PQ&4u0>b@>M<J<w*|P*44oz@e0mEjz$ab&zQaZIhj~S5ENJCZk8ma*fJHn1
z0k8~w0M>!JguDe2KD*8;LEAsMHs+ZBzCfLBse6mGZSpbUW{wSD%g(<~Ach_CpPf7r
z|1=Toh-C$Cs1nX#2gs;4d`YcALh(B3wc>WWwYa2dh5Vo7kD#BF{JU%p90Utvn-Of7
zcUMex*QxE7fmIuZJ|sVH(Os}4it<t>-h-q67SKw7Ee@3&ilM7BvZ>M;cK_*s$W<#w
z|2^&r+F;br0stZe_`gax=uZ2ApwEkz0`fU!RuPk#NNUF9VUNhQPYgCUWn;i_z6sl`
zxoM(0LFaz~p4lW_B7=rDzzUiFu(GgI0Pld`fxm4dP|RMOQG$j?`S{Zp+rI|P1Ai)T
zp(j(a<$S<sUI*w6Yyhu--xRa8#SMsK+}}}<1Y*iUjf=~-3H$lP=pQqIKfp|UiNk~i
z{%nngxej~=o)Y-dl?vKZ+Jh_&S^q*Rfme&M%L}5$tr`8b5hfJ5(Ok2|9!9^<cK(1z
z7?E}o&1LvBU7NrH@C^6^c;lkjgw2UVvVbfT2f%b#4Ojx6618*79WrKb*P!r^C;wRW
z$9Yhn2%5cMDXfUigu~K>lS)C*;*cuv3HTNG#m=9xGuj$SG!PFu-dNkzzW{z$)Ww92
zaU(3ETCukvLlDmTt^qHBC%|)H$;m}adb#=qwp+QteVq1t2mGPHi|+UPq>CqzJVl+^
zITrzJ055?*fqC1e&;rWDnHiA+Yi|msLF`t?KMSURTr0ph+hIboT_EfnQ-nVOe<^Tn
z%HJQBHeidoOmj2|eEAA^qV#W3|BO<FS+-9#Pr|hf3tbzfVm7Cb3(B8mq0|#?3GDXo
zUHmh`Mxszo-W5LA2;BLHqQ17+h&9Y5#(f>04e);vAl@plK;+ikI*ewX+|~$UC9{m$
ze;Ig2)cOr$t4v&ug1m#Nj)VhvuL-;Q(Z&ER-CGopgHSwR3rO7hhChMdfO%a7(8~9t
z*(`@AzFU>P``^Hyz-v?Bb>=zBY~)GvVQWm^X%%=W)+aVak+LyofI8Ux&5W;;0=vK)
z;1}TMF4$PJ!TszI>1Wv9WRm+(aLa!Kp4%z|>l-cdpRP0_c=;Rr{ojf2*QS3aQHxfi
zt&)*3_Id;SPT1#N<sVRtx+Y1RbgL6!o3OXP1J7)gL6i62PZ{<E0@`XY2fzlc=U=){
z&w7=^{LGle4kms3<e#5`m&*SMyI8E5*>s5Z(o})Zz!Tu#z~5c;FY@vusq9)ttHHo~
zo}lv!O8+DuI7d>D55))A5$*X-S9!E(F)9L;WKL|r+Tk|vj-cT=`AtLw`WS3z7SO$j
z@$VBoe@ZJkKp&<kTEu#BV)BpEey?<;Uy~%Ph|}juI>)@vMc^r6hga<Uklq??aA}Z{
z8o(}LlmDRnw)IcO*uO;88Rl~_9%Ba=>p!Cb-0zZKZ6}H(^B9wbcgR1#0)LwW*5u_`
zduEXT$K-c~`X|T#DGSfm8(Hi$#Rd33a3A~z@B#P>_!;=SYaw6E2+ukkY$7J%f88c-
z@ozkURc1PQ0t;T7^c@=9y|4<r27U#823~b7m96n%UpyeyzK6WPCv;zP%aAl$)V!hf
zEYt`Z{TJ{D@ElmS_s@DjtF3v%W0G^5n5n-5e*o{g7L(WH$E{n!N|`C8?=A2v@SnY)
z(=EQgnNia?yG+t<2yXNfV%+BF-;zqHX>Vgacm6+sUx7DWOJ%L1n25z~@Xs^GKJ&nD
zz%Rh_F4)~J{_%xxo6LUi0PpF2JndTOYMmD)4r*5!^|&PODzFH=1^%S(*G+#L+p{`o
zAO_U<|4-mQdqLNm;w4$tQ_Vw-nXf;A{{sH#T83B4cM(BiUn|)!k^g@O=6mU17ZwO4
z4o_L7TqFIQ_FU`be<NX$9*6ba)`1Pu&*-1&?>DH%OA@=)DHkm+)chIv1$f<c{+iT&
zjdCVM_WMqL-0yeO-%`!ig#`)!4v6{uj0W_R@_$W|KPkV(qQ(QJiJk+$0e=GTx|WL9
zrF%5$XwNqNVp{J{^nRcB%5U%eWh{48iJ9!~&kSg)zuy!$AdsjpBWS(-__6xO=;3#)
zPl(SPe0HhFSHMePL8<3HyTCjTv&W@C)>(VV;Qb}nnKty{I`ED@)+TV3Q)wZ+Ij4Z$
za-Hl}*N>-MED{5|+^8`k`_T8e&i83A_1$L+c+M>AI#-I7sKfWU3;G%GvP*k90?xC*
zc>_4lxs(ze_Z47)xbyFNSE6UYBFooXz!eTARv1pZ)bA_0AB(-`-vD0l2&-JEnozgZ
zXoKDo_xlL?H+j`JxYGloLq7p;fEU1<-c^(-6>^oJ^>ZA|bxv3vSOT5{e*!O$r2meb
zf1RN71u|xh3g8dm8L-f`61@d{qR?#vGjx9|r06P3Kd*o{N3Nt_<`+40eNhH`Tg+3x
z<vQ_CDt+#=$`tDgHzH4x1Wh(Ryp?snx%1b6H)NGX;3_w27q~dngzuRK?_Kmy=;*9+
zowdto)rh(6`2WR`^xp<vGD*0VUniXr%=|Ui!FTh23@lJ-9dey|NxEPT=)nEmyGqmj
z!YBUtCUA|!vug4ocbU87%Th1>H-M*n)$3dwJH*244)6hZ1H1xW_40oM_`rhh5^$b_
z=|u{^qu;9&)<G}x{cB`FOUE_2IcOeu3B2jDUaEgA&wL&j0|FXQm;8Lsb?~3M_{W_;
z<KMeNJd}uwPnLjpz-!<&@S%4VYet8@#)9_c_N=^z{|6(NMPh<%^lE?m4024|ChL86
z{%o<<uzVb*NI*c33*6LDpj2%DE5IVKbTXjI1fdkhWpizlfEJ(50tj;+xXk6#{oG#W
z6yRHy508HTh%@FFxap$6wI!=Uzv^*69uUxBj)S5D)SixbANl*~abGdSH4k&#s@0%>
z?Y?3OSkpzyUG$p-E&!Lf>A+)??514a{&7F<0b?A?pCtoER3OU)6IjRbpMGGj2T-Uf
zfEXW60Msaw`DVC_x(|q1qTB4Mogeub=l#ubFARi#J`zymSo-(#{VxF10vf)<+WE&5
z07*&$XmgHxm0IFJd^{BpW|V6(W@I$VZ7y0Opwdb79|Y$3{4ibFm|E!*-~VX-34s}|
zX_yp5vwhm&&nKVX<C=(b+$>^QT&IHGv-W(f{Ps}<=cs~8KtlO_Px(KZ{t+-i#P?|e
zGJ*78qW(FF|0j8W6M|^h;Oahie_J^IGXTsHkY|ip+?L>ZESx}pr++T+dFKGP2!`>I
zjTW^Net%Q|d8U9Nc=%tk>2Uk(kdOl%NHGL-s}6`tc*>_DrHF`b%^q9r(o>%=3TStU
z<+JM9Aqhi5!VPjb-dY8>&H^dd$7*Ch1^9qJzJCJi+NaE9XdhFO4S|HLpZdJdf@q(N
z2U_cH?bPROGQ5wMSLZ97iT;s1A2EA-$X7W1_k9+W`$YjC5*6`u`Wy2NaBg{*O{Uc|
z^3M<<6<tBM#VGtt=l8gM-+B_P)pqynkdUJ^pfMlbJ|iS#kzf{9&~K6dcD_h{`(%Yw
zjUsq>>hpba{~^!s8s`trLjST90}715t@ZeH{s%X`4D$-(uGY=7{Qe-1;4voLWrN+B
zp6{1}xb+R6iT-{bk`$}~9egPy<RGoz4i~qc{_+s{9R~W~wLzgyKX>oPN?FNqChOFb
zSSIaszB!VT*2r-Bz4d-V*$&eD(y#FT`=H)|j1hkg{r<b}ZxaOZS?t!4=XWWD`0IZE
zzxBK)wb$m?(La>m|4RBR|9lw<eW_-CG4t71*a*?roZpicu&<$iNa6n)<$?b`|N41c
z^e<+CkR3r^PydJx=>N~@4+)<dU(Nr8o`nzYTou;(Lm4frp8Yk;w2;oi)umPVtVipU
zcA2Ie;Y^<jpD(gVc3@*AXC!Tz)h_F^ub&Y#9kPn2$cANCYC6;Np7d%IWi<KV?2=ZJ
zD+IGVUqJtey+$RbDGxafd-}&`my!+b)j14!R{ED@n@P&^AATwS^h=HC3(oKH<*nDH
z#Va@~|CjjV+Nf1MOL%JM@3(b!QYt*t^MNo=#KilT(Z8RUAF{u%#+L6heSFz|<H>0F
z;aSY2C(Ku1cvAgR`d0)rzDfBx)BS@TGnG8N%vtr1C(6qc{;i!|O)LLo^88u#PaxZ)
z0$ySD$n$p_fI+TqwGcg(E}dmy{bV+BIl5UvH_K#_V+q(inT>s>=_X|bUQ0mFPs#BP
z08`u~IY!Le9bg?;1=ejx_HiGVCY%DsxqIFW;0}NPBsK_uabTJW%6`CSi_|(X0(MS#
zeh8T4`G*Db*ywi({mZ}vbDX18AT=h)*G@fu$TG{c?2oLoa<r=RQAd78h>khM1Z$na
z?b=!BU*an98P*75<_FgK{wIlnD^tu8^n8GmbsGd&SU)5EU6Czdm)DN)|4Cw82F7^)
z5dp{B<#5vG$!atd{+T3ZY?afUYaHl3>HITHFb@#BL4{}Y<nvcpW3mEAEL|!qr{<V_
zV3Z={`XszOX#463H9i2tTwOfDCX0;9XNkjDC)toP%T&{>@%c<~uW%mhl=DyU;V~|R
zZm>~k`BeQg#LFKOd_(KfbMtX0$*=Oi)d(E_Zvd-0A9&=)>7OYsL+zJ3_Zpwy(Fbw{
zU;?;G(BomQ^b6Rd_n9q1pN`Yv<G@vdR-dN=P1vFO4wz@1XZy(Ki^Pq;CF{R!^MmX1
zc?rz-0$#e$6mS)|30#%+*A4c_Ed%d?dEmowI<Ox&2V4WLut-v3k6N8g@-Kk5$JKet
zz&YSLaGmhN{ZydaEE2v2ULOfM?g2Bvb>JpT^csY@9=+$J|Is>R2)F{=B5dd=;S;Ll
z-goTsIhy_z;3A>FF0jQSB*d1RCVj;j*X~aU%<<)~05d?D;}H$%vY7`y9mm7Nz(tba
z5-`phi)&L}23`{}{%9Ri;t{?9u9AcSCmdGU={kR8mx&Kd16P3Sz-3^L3xsV##I?A+
z06rhb|3koK;0ADm@YEjjTRXr9;2rP=_;4Kk%fLn87Wrp_V3$=X@_q!~9|!tA4O|7j
zA@tY)F&15g_apEcm<K)|$3G*$W#Be&hsZZR`+i)H_Z#3f@cxAJpXdG0u}*3^@(uFO
zYwfY_<^LPxpCLMsyKJ#}4SWQaPdNW2-rog+M|mt0EwIb)?QuH1MBMe8LVqxPUIpF(
zFM+p5^1q$`8<Aft(5!Pi!W&@WI2$xf{<#KR5crQei{!@tJ15ZpCUA`o$N(Y2oB{a2
zE~zDZCba)IL)@x|z<I_mW!7n3p?ijI!Ta7j7ze%uejs$sB+Yk=i0`iZ<|R7?*SelB
z0+)bC#4Vm7o`Lo39+0puG|%0E2h0NB0QU)7Ya@fM$-BxapGhF<LhN>*AzH6*fy;D4
zO0v3n54b{{k>_2Y=^*aXJ>WiYO{(-Z<LP=0p0fS&bC(Rv61MjaVN+AaawVV+T%lC^
z!24buG(zCb$G{B%9|1$?9pDPV#9#Eva0$2!`~ds_+$3<FFJIww;4~G(l0F>p@*Wd-
za*(Q{$<Fybg4q|k0H2NfZvqd1yTCPRGp(_Uc^{aiR(o^g`7aT7{2L;tMoeKH&<4#2
zXnp776E*Wbt>*|sXrC-}O?v-6_MU$LxCVR&JOr*2dMo3Gjzi!o9g{xbd9M#q0<Hkx
z0Y3oO>3$X&LGG~8=8eOd0T#OWX9l<pJR<$Z^USwHtvpQm?dG3x(*F@~i*n+z#^7ww
zC2G}Iy}d?d;$A%hE;D>A%gF5>N$BX`-RV3B+y%ZPaO0>faIZ2ZnWhf^&?~<q#J#%9
zF2zCFSkw^t34r-tAEHd$_50+XK|u(2PPlb1KJA*5@(6nW18@hJp!e)ju!oEwKRS>X
zn^D%!sK7?EN3%09;2OQms?YcGe_3GLuMy8FlARef;07>5udpq@p>fYWYR{X%Bx@@+
zG30TPL76P^R%4k#a{oUFJXhi0YjA>YoN;+U_U?29@Eq`fz=0Pf{ygNSrEwX=c4d<e
z4^9DhfXBcc;0CpqEt~*TQ|`xIO6{%YA_fS${Q$Ve6xaZ@N5T%Q5x#$uZY_+Amngl?
z{Tj;Seh&a+j7d^fH9B$3I6<2q0oQ;LUb@G8tjDI8h$iX1I)OvL4QlT@giW;>&jBFj
zAb!Z{tCyNf@X5eG0uO;%!M%@&GUaB`2L%(dL;o4z0r0;79sqMR-A#HhHn--ihjR(u
zDgX2X*MXl1Txm_tsf-_w2%HRn*DAwy{-1z*M4q;h&k|9R#`yk+%nNnSe;@c4aF-7U
zoc;7z(i)LL{<Mq#O9X>`MBu+cxvGqd93}{A(gdD&(f>Sg?;ikD%)W;lUms*@XpHo4
zbn(wE;3wcQClvZ+T&thw9~M}P56VAIe!mC4qn7BCg_|DFKSBirrdHdZfWWW+0`34~
zG`+U!0z~QSm$G%|{FA^f;0NG);HvD52_&UHM4Tn}{%^YY=PK}+z_Vjg!^!v}eUj2o
zyC!u5;3DuN<@cQI?6C5FQ3^}${X6HM0`39-1bzT6kaHsntTnBTFs6xtcfI=OM*`1|
z%7hZA@cIM=$M2d{4}kN)BeJkr*@UXHkJ+sX58syG&{5{9U~U?h+N5tvlp4zj>~Zv}
z{p&NpO-UXF`R%a*H#n8)P;Im2GEoKikmh?xb`Hd3Lvv_TSvp;H=fA?(<r1;<EO0Oh
zbC+=)*kjjM`}`g-EwHOovMs{i-xnnNCLOW^(?=7)P2fA=E=OE!XF$wBc30V(1#UB?
zxaa&xr#<cf-x74a!XE@@FKuT-#*)mQDd3RW<40=$iZo{gTpLi3g6ILq_S@$l1FqBe
znFWSKN^SPAD4-ZMwrn0c`d<d_19ySTBI2M93g}8(MsAxc`r7~b9B`kp-%x&CN?H84
zOtyv{*tSbM{{x19gEB83%g)b?X_9SjTdf1A(0Bcwpz*e9ppfSu3yAlaS8Vgo6mSc;
zN8f))8l)i`<RW@C`y8xnGw3KCklVb!SUOv5^GTWB(>_7d55dgQL*Nl%V=MV#{Zt@7
zGq&vREC0*^cNn{j3PcxVYh8ti&P{HsJTM*R43JYHbF}-A#fR>@%F?&8tLMJ}JOCar
zb}--9<I6`f!Q240xNx(>Kld2BObP6;$1!D(`<$*aqgz!QG6vja`eIsg53yWCL<cIB
zb?Y_NKaYXyz<?}?_4v!qpDHpbSLN2iHgC-U-vSRfqFEN^wG9FlV3J;WLjI4He;xw2
zf%9}=5{7v;dtYWo?0{Xzjlh}T2f$ClJ;glyTbKu3PGrGk3c8;U2>Kk+$4GiMo!jKL
z3u-qV0Yl&-@EwDfF+p4Lcyx2<BeL<H6;x+{N5DUM`vde86H1?LqABvL4_K2;0Sp2+
zfbZx#_Q{&MST-JqTtDZ@#^;7R{}teSQ6Ld77P2qgG-1TOH*3xbnIvfSO(K651=Fb@
zOUy&MKRYZaX72WH1OK4=U8ab&$U4?<Ya-d;hU^BIzxfXM4!BJ0Lc1zZ`-IeC1v&p7
z2)g|PLBk`VUm%-1eIZJfV;2d45jp=PSrjt!9_IhoV@YPqDR38lIrsId?EABYADQe8
z&uE_ZY{~UF@BsKH@GXG=BT?7Jg(2@H?V|q>@D1<-@PMiRKG`*u%0B@KSttKBK=<Dz
zKQGIGZYr<sQxHN<|7~$QE~bEojJ?O@fq*(B$-4nQM=#q2;bGtoy$8$uL~@E$_&1Pg
z*e$2Po%{Sf@B?s_F=EKCC<@?jz<Y^h$#?twKLOuRe*{vT%zFAo8?x~_GyZY0?+nu}
z;8B~>=4p-VG0U=b_xG*<KLS6}fwB3>27A0i%BA)7>~O&@&H;~j{&5C~ULMxWlPw%z
zTTGk(hk%>3zT@<Ge3Bp$U6}DATkL6QpZ@}rcNbZA%!KFc+C8EFhFkFKdxBtHq3>lJ
zZ!&1MswlY}m&|T=-Z;nCnl2BxL@l5n@AififJu&`jevf^1aOYNQ-sdPG=3;MrAC3v
ztTRLbFacZwE&x+}3v>U#*rCJ_ZXCD(TvpKML4vRkb26tUeTQboG8$1k?4WzTLeT9|
znNKT-6XTJhMIvNhB<Q>!0Mo3UTHz~FP(Hqh2b6#zg3eDVX!ulnuCFs%nB{NIzM{fu
z;5h?r?<45^m?Ukpo~0}1$_%s3W(4qkg1DRGl!~C4-U|3FuF!7DaDX7(7l09Z%`x}3
zSgU(RKW=~s?K5mMYJ-Mf%4dH39_?vCnB$krz!>0r0(xGOza7b@dWav~2h4Dkw!=T?
zShyV(>`sUu9aD!ENv~PsAM2d6yuaB{Uqs(|A_Zb(yHQ@><;LcuZO4>$i~qzTyraNH
z7OC3jcPw>@A`yz+(pPYvOaP2>6m`hxZ|4b_bBXdR&LJ74LDmB&fgoV-$z|59EbW{~
z18kjfRGEO$md?NkaE_zp*0$G@Wq}Y76373SRR1{oyRMA_txS?9UV+|Yi7eFS|A^<m
zlm}h+=q}o0eJl)&uuZaa{#jsx{R(Dx5;3|KN|f0tHL3b%Lb_%W!T+&a^vJ?4oqL|=
z^}-S`BB0@+Bwu{#gHME*5xaa#z^>0l%cF!Hjq)yA#3x(i=L$i`U70*7aaZynBV><4
zZvt#~JRlFcKEZMctr4pJQr_O0(k?Na56OVmB)aW!QL8nbCi&qgPh7c#-#9>R?*UmK
zFe)kHqHKv)SGFV=Vq?~)Sw?k2b`un+^fM6}Rp13+5SXL^KJfF4etogto~TY!AVt3K
zgq^>_p-MjwM43nsl}H>?WJ<XoD3nkiq4hKF&+>p9zOb>+VuHIbguoE%Gkp^86(v|o
zh$9g4?^~bk5|C8b`Zhq{EfufP%kPYJE$XA}GcJ{II4WByjQL_o9;IA)SLEeS(&5?l
z2pX;qlwO!o#O~6m;c?}k3GrTBjjknn-->C*i+wf+TS9%9J$sp~scQ*LhbLZDK!?Tg
z`;JF?jLK-WhAci=Am+E1ILhpN&BWDLrr5OFk|{C$ST{ug3|dL8go7~(vyHZc!qXt-
zBc|g@ECx7Y9oT)2F-dLSgZU7Jd=N9hR^K6~Zyfz6C3BL|<+s;1m!<>^^7n>;VhOd8
zJpaZtlqL>=%mkPYcJ$j0Sl=iME=NJCZvmY#PX_1v+2rX1scU_mq625D3i*!HRB~aX
z>~bl$4Zsu+RLTbo%D}#y#OfPuhH@`7l;8JnzKfY(`&Ua1_&`!%b)`sW39@qAg1;tT
zz@r!yfzTedw2wKPkOP}A*^!;2-oBioKv!q-7{nxIf_W3jq)4MI%kl&Hdq%b*Ra!_L
z-e<cA3aIwUV19wew&~)iCnhS&B80Y6&`;AFF>~eR4Szxd9CC?TKW%=iLa=~><P9O9
zVHUoaA6>Cs9tG3~xuv-%?3xPY?JNOxh&hKH(D|}JZTR9xCz3@Ah2Mr5>m@duAIW;v
zP*^yWVos3XB@z|5n4px`b+Kd)?M6VW)EU!qGiQUc*RM&oQS0sVE)wccyYfaE7!n66
z;5j4eK#%w0vpf+os~;A)7{p>fbJ}8w8qB3>F^+DV8e_(OY5VEQ*&65h;se;ESyHC_
z4+#Huxh}|W(KQlAk?e==O7*mO^G@LUS;i=F(yM6lYbC}~-f6KWf<#u=M|%H2T;Swc
zfwtB>wuHO3LXm=sOqv!#)QV#F6B@0M?IV`<1z#tV-OlX<g}qor`YQ`r88RaM{SKlF
z8j&~zWfncufwb_|<E55O|AZ#ZL##80RKuxUraoPe97iWJS`r`IlyQ@n=~1^fIaSvg
z-c}M>uFexRu2GEB;dO(_4J2t8i<@bFg!=K>QNPjt*h?0~426<FPx(@&!`jiUWj1|M
zzeP4O#~~DxlslBkbc^5Z<-essMW@QdoP+jB3X(R^_lHoyk}_HTzjj(p`q(s<iUEKG
zR7Tx`l_xCMtG}G&AG10fs8}tr*(OPo_Zdn7qA7<D$$Taoom8<efuFv!)n3(U(DZ)j
zQYex92eV!+KA(wUuylf@rrHABPvL1&a>~-J=23JLlE6}9p7Bp9kB<jhf>{1n%U~V)
z7pV=k;?NS`)eM-GH50oX)8=L+weV=MuP-H)Sa`|Pudo!iW-inGy~KIZ3cI+Xt9BfU
z6j?x+OHHYx2XU9I2Ih%vY4Tq6m*h=){7(B5(`z0TcDqS9W6q_<w!MTWs*`j3^H$Z<
zBip-GxSyyb$LUf~zEIxOdz+uF;sA*NB00ZLj)EX#cwvi{%6ya-%e1Lf5PfkXeC5+L
z@9%nY_4?=b0zO-KEV0+t7?!aNL_R&yn3mUQv4^zkAOo8mHn${!`Fqv{WNJ7S1j!ae
zR!q2)JoUAgD5ElKil}#>4QjM<D&m$yHM881$@Dg8xcm13k^4kMC+@kOln3b?&%8eK
zeT~sG-lZ>_2|6~oA+XkgSTA3UZcCDS>Yv3i4Z3^(l914<u1yI(Q$ZwdK?Z%dfhvW;
zE+wR`@%WndeBD69W4S`RXH`AEyd8K8j*F61I5Z|~uoXK!A+RSb&@}UC%2_SB6|0-v
z0HAFZ#;@)3Hv|FQ$!%Snrbu!bCH~1C^;{N9qS_+BOyX^e-Lx9rRM7BP&;=vWewGN%
z@@|wT0ePGL4YGG6<~F0cH`QZ&*)WI-lYF~aPjqc2QD;d)$<DpRY;3FNa|gyP$X6mW
zb}{WTuh?mmud_?rC;csDU}bIhpGqbOF&7zD$vmF4n<iwnB444!34k{LXtA~;aY!uj
zqV>{+G(ZLFg2b*+0~}v!|C)|Z#KOBeK@z%#EZ0*uKu@c|%^Mc*_~HR4!ock{lzZ&g
z_aun$B^<M^x}p?X>!v`fz&=pRLry|m7Hh1shua3gS^_o&@i`S{$b>5jB+R~TeoXl?
zmM*jZV|jCHXYu66cYp&&f<uB<@AXu0A)ILZP~&9$o&(+91-5{F7I7fgZT?L|M7Js+
z<!$cYBR89~Z}uRQ*U6}Gtqn2(TRQfLy_Yg=5RiM-FlM~+9X-E$eo*pfp?JWWmA4+w
zJtqk*ZgSiO>a1&-K36I>NrHypl(#@{V4tpOB(@SgndnqtYj=QJi$h|Ege{5W7ENPX
zypnRcMTh?Tv|gs-%&Is?$I_+|EyTXIQPu?Xyuv1*l)65Y2uP#RDo{1h@D1Pqn4qqQ
zoZOe4p_bsOQQq3Cuw6@u4LcO_73ffuOeV`?tB%H1o_}9pN@IcgEeI_^`CD~sJ(^UV
zao&bQemP){#RA8S-`*Zl+XeQi5LNY>N!Gk%otrgw-1$uf?n<M%Iuk_=xyeGV(WPnL
zY)xL(QVGgA?k${qQ0H6c^k*urM@E;x7e+9b(t?h-iv@=z3ubCBl4#PrXZOC1um-Gg
zk$72>=|(qSvJ37_4!GCl&}oKUE*WVs85!}7V~L0$q0OOgFRhXR_t*rUNFpK@=O|#6
zco$fAFHq;0`#C0aO$Yhn#F|f^umgI_e*ci#%nEOr^wG9tA}6B4ay?YL?!6AUW?-jh
zfKQ_V(`pBt>f8nzZXRhF*al|vi7CsaH6(a#0_$8#bLhBnlPCb^WqjWgug$s$5&<t+
zw?U*THzn-SwTZ+PRQDrbo0r?Hu>-Vfz&fx9Y;yiiElw<%P^;$e0xQ6}E#%na{nzr2
zrzgv-GVv1kxx0Pa&Tr=#pr>dKwkI5&aWda+R@k=le+E{`{~ookCyCHNtmitH5>?$x
z?f^@`GP}4!ahFUCE&OeAf@aMIk*dTKS!OXf+Oi4&0Du5VL_t&}&8J!%@e_)Ejrb)S
zK)psNtShX8YpTXlJ=Pfz)8KC#|LpSoW0c=gzLaQ;++qXmDnZxV`oOIjxyYHdfR0H+
z0t(B@HwoUe;pBIV{5&L$WcH7hZmgJP4oua4f0I4EBVv65xkB@#-1E!8XXBqk_N4Bz
z3$Z0Wd6B6(kG_ec*Jsy;*_2iC(@f)c1<RhiNKDZ9HRGRM_LLv8u__aqXQIzTK{VP_
z`CTU#zyy5=&3~C!ze`iKM3HUyAQaPUf+57rG8XbjPw99p0dE!gWEc1hd;r#%xAi3o
z)YMnT4(4SQfI~9S5>TVAYLJ1Fd=%a!_`@eRO`j3Tcab{WYNrMHC*W4Ab>I{54p>nI
zv<SQdR++}Lq?zV#8g#$6fF)qn&R^rVuF7ni#l`j&pzpN-e6aI7`o9N0<=2UKl_<<4
z?*N~<j--A59WEJJq+hM6I$vb4$q9(hz>3m;1$YN60*9=TTH(6FDxGzpyYqhlHr?~<
zMAF#<kVmW4PSA+jW&wDwTdnrF=Vpcc7qTp`Ih%}HWrfHyyCwk3+%vV!!gb12OH<;=
zMjrX3%|GkFJn)u2OiT1W#G7==WRtjWi^~5Wf!Dxi&IWe%%#`VwoAr7REZF&-4u4ND
zl$JDqq&@bu#-&Q{Y?65mcniFx527NdO%MU^&pCW+M*kgP5m*3rB;?Jc2y5BS7SI1y
z>3_&QP49qB`P<qK26`U{#IF3P{PP-k1*}m1x_a5HLg1|DXTEGz`R4<dj5Xz*jb;aw
zize|?R+ax(flvJXP<Y<TYt|f{bzD>5|Hol;cS|?YjWm*yf^;`Xhe&TAh=jC+d<aPq
zq)R}UN=l57Mi>p!j2evXcfY^K{rm3j);af_^L#&FWFgQhB}6*98oY*c*rv!!glBNd
zf`}<YD@miRu~dRZ%b&FqjJ}a(l#xf4TB0Sgtl;;y40-(Hv*l-CkVDQo6+}VMGKtYC
z3bNGvE)TlY_Zk|I@ZPWSsdx4?fg6dqfS$;=48A6V#WBwz=<d;FYd%M2;G}P72w0Xd
z>b~adtanqBGE|#*EUJ)wwfs{jqQ4zs8<jNIoFb{sq>FmNyZ^v$e`w-zZFxPLJpYzr
zqRtNxF*wHYcL3~Hgq2vTY*v)1UQpkm4xC7R;V=yS;>-C^TbQqpPXj0w`ah>nRYOH`
zRi|fW5UytA#A};_DlqHW(zplP=Vg;Ir6X$D+t={oD;DdTSEb`lK6q0pY?FWbIF9~>
zEg-+<vL+8Zr56#wd*L3QLl~;Rr@Gnfc|-Ux%-gb?(_@sOUVAR$yNMHM>0wl*iKFb0
z_0D$JTdlrP?8MeAv#u0`!AI|JV&ZcN>x+4-B+qKUs~~P1>rhY7EN<L-9Ozh~_-I`;
z=4&dr-q<tYO=fjuO19Fg3q<^-qvTyrrtQjxFrhhmsRD(@@0Gdh^jiNjZ}T*M89{$F
zd<ylk)pEV7G5*)YR@lPB1$QalOE^BxbF3)Ecg{-<E3D~+0^HGhcBzo=mtjqg%$Y%c
zfC5;DhDeo_&6<277JPQ=44Q#I>ym?(0;%R$vo6&*LM?hWtVyh<n0|<-{n+ZbncOt2
zGHcN0n`K;o954fT2=eRr7?Vsao%cIb1A7hE0UJYv`hq)N@QdOzR3y%yjPHJ&_JIhs
z8ijsJRFJ9xO&hBJ@xJfm8gl~j^M7{qX(DZC$j~UG4?A`PSrm!3`s~MPHf~o+a=(Vt
zPZJtr4|B{J&C2XcHKu47lFlz)_wIl6P$d1D*Vzfd`X!-0iX7FWk0n)W^RZs-GR$D)
zcPGHDz@Q{M4kHbrhwf9AJ?;(kaeHkg3U@2Wr{0}1tp2ZZ+~IUV3+o@T&C$sN0phdM
z<u?en+nnVPTNrqxyoA?2>*i?@DJ{DE>PV%(opp&)vw&hX3}OzrW*ng~`}=`^e}dz#
zJokdaTts;u$ZkuaISuY9^fLG^p)@o7Ka|=pV0)ryV|jk-{o3ODT#O8`qtuz=nL4sx
z8!cuUM-V~LLF8)|g<h?CfbL1vF4+9ZO1Wvl?*+x;l!O*LBG-lJho~|WM_>@Z{F4i=
zcOT+`#jB02b}#;&#I*P@IT@OKBc3txuI?FK+!8!<{Jz&&P3)*!AkOR)eDj7xx)jS8
za+8jZi}}H6>tGmZ`=><I`|wc>#&pHAw4f|7iTdl@$deTV#jaUQ@V0EP5TZi_9h}OW
z8skJ_Re6ehEWX}O_jhtVds~0}0}a4n!pY8X0aeY^RqkkWb!dW>PpbYhAZft@1INi2
z`hKn8Br_7LG}^ir1y}ex%suJoF80P9b(pqf(KWK@4Xd%QlRrk82UCCUez_Xm28zZ)
z1BeqESLJdB17V=ohXy#;+&}aK(3&b_!G5vSg0}T`E@gHFir=I=cUT{AgKR}T3{zj9
zAc)teU%%Q52#racFp)iY<u1H+l#PZl#*V-C!}Z@m*6@b;lvdByl6%HjcE>8=T`&FT
zYfmDQ(tuD|MHZi;Hvrp?wS7ip=Z~}-Pr#!?3lLd&;d5FTtL}f2Un+&JuE|;Z*W)k@
zXF8`s(+Q}M5nHbmLcbsepkSg;=M3}dQFy6g#5I4z$T{ilhmAw^1ti5tv<R|Buwm>m
zK+So=Pq%K@t8Z*UAT#&;Asx6N5)&hac|?e*q{fyRRSk#$_lNp%Nk5n|B(_r=5g9b-
zw_-F7U*TeovJ!UXDrq<D|G_+=u;Fv#4=7i^C+4yXz7l>OcXdy}V{%Rm;n*((efF#=
z^{iy>2VNYsEVH>oB1VJlH+MJqrd8zPj8~KFPh(VY=&iUThV_oI=iS0SzuAj3)^Tw;
z3jVRf4+p9DjP*X4Z}Ne0Eqc1vV_uR9**)`r%SxsqMN#U`6h)#FlXu&Zx~i)<G)<Z>
z2gd2eqfgf~TEi=m8jMITJ1sly=C3`y;s*Dsx1YSc$d_MRcAJaLI_(c#Rk_#7j*OET
z<jS#R(e%aO3bX29JagwLpbFMQ4m&DHvq?-z(1p&XKf?jLe+c`-{nlhDcItL14Ejnm
z<J#&DiPejz;&t7X555MZdgY`Z!?bW?;aByn5or3kO@RC<*twf~M?8h(X!v8Qdhs}~
zmdH|26-SNga}HP)n;nwKec%WzMT%&rnE9&0c#x9dM~t8@j+;#0S_D+SsO5S@^)&AF
z@Hv=R7>mqFq2HIcJkpknXJmM5QWa+TUH9&4wxeMdI||tC-4^@o)n0)<xwV%J+V)90
zI@MR)db>JLy$pKhs<-{R@n(Y+j&}u<-n+YVi&!84-tH55bjGOa<UtR#xGYb}hO=bo
zXDtf&<?$bIW)>|KY!ds6!&fmnBh4;R_XUR9hc*!^8&V^BNN3Z4jIcnKMTPA9%UPJ-
zA*VI*_pxJZTO@M#Bm<^*tS6*)*m@MI^55Ax#MvqSemwO^V9B`(%w7}o?7vHPcQY+B
zp$FTJAE#Xi3d9DsP3b>|xI7Bb7F2$3%uJnjS%qnGBt{6z*lwAOBw6VEXESV6-cOK5
zWmKE-ik$?n0~RkNKQOWM%Xh3p-UHz#FQ|wKXQTR}E&t_})1L0(x;`f8S-?1l*H{Ov
z7{vq+{y-HrAkQDU^+I003ezQ!$yfUN6r7F1MV&ia1XB;mDRz86+J<o~h*BVk>q6nO
zu%D{D*@0<e4ldWM{-70p1Q^;r%e`8cdvPOIAF{h`tctA35X?;w9KUKsV7i2Dn~X{)
z%y#6qO^zXDn?s~OCqvQBPT55^#J=yAxG}z^<Na87UEZE1-6^j}`=jhpTdYXdKAu?`
zk8IcI(V7vr*lfp<*Yg?kGp_jAj;Q;rG_oqk%V?fNl+}>I<lQ-MVTmLS#(gh?CuiZV
zJc+Z1|LoNsg`4=?OL51u>E>(trz>KLV7+dlz5)!_{G~GfjH5ONbArTS$`6oaxNwKr
z^75lo_PCw<wjZm_J{4T===+HIkZ3GdY`-Vg9M{RK^;1k9tp4)dMB^6tv7=mnE;`mB
z;m|P>J=u_mIYsv^tg)`6p5TkM?6r-$pskH9vmFw}q84K`o>QJ%?(3Qdo?8kqwi+?f
z9Wh9G35q#6UiXTTH83G|9Myi73v}H+rq$G2w(r}0h^d|DbF6{u&v2RDo1)JV4}!qy
z#xWLDlYNMtMt+?uyP~_yxF~+>SkK{GimTf%rr1AMd;SWa%We3B#A*Kb{(R4k#%M5*
zjv6b2d3xvG53iW}m*9AgC;TWG)!wH8#`k+**_3D{B^Y2**fH}+aV2GROaJ{PhK0Z9
z?vrl=OXls%qVtVHQ{1hz#xX3buq73`^OXkwT*35~ajla_OkA&GqS0tvGg_vnD08PR
zVa_$IWVxS|_nBqKww`*;=57(Lycy~67TlUCaKiZmN9V#HCa!?u9n>2rt+>dosm#!V
z=Q9aL45{AxPrn6T?fs#hpj-z7?RG`Iq9@fa^^<It&DDiZ@UYSn7a<((UJbdMcG&jX
zWUU}`Mp1x{wQEY%pV%i4O6Xhhg{(y1=fS^XR^*Re1zUnnPFJ@!UK<%s-x@qm8$0O~
zGJCP&X)N(xLW^Z=;<N;?<!qzx3S5?QXjcv{6o+VwADc0qhfPxRlLsLW#C<Ad0j!7C
zjNL5lL9)W$H7^43LjQ>Xa)%mWyW(jO)C0({*WUxyn_rI(`J%`Gc;c6QAa)nN-KMs0
zla3z_;4r~gDJJcXEiOC%WiWNpGJ;A+Q0jo60d9*#x@za$u8)sA6skt>oe~bG#AxJ2
z+$#np&zd`R%S(yHhlv-oxWMtpO92o|=snFOJ7wlS{M7#@@lU8E?r_eitQ3siM?h%<
z6U%^ingzD~uxE_*zw-K~pGp&WlVoe_)Tz;rhaa(PIeqM!TOc|uY6^klq_#cgKZD^O
zfU{u3^;BI?4x=V<*v^@-cHi*^rE9I>F6I*N)Ht+x?36OO=opyBy=j0+?$ucsi*x5d
zc^h&{IZt{Hvu&M={m$jlL8vIEE)97I2*GlXdXO8Iv6jCp<=?;I@nlQ!%|shaUBdvA
z&7&^TXdF9&-FFT<#5{GCn|oV-!r7Z2H8ks0;zE)fIY@+AJ%2rQV(tRjx#wuJD+<>y
z_)%G@oS<=>QWmK~LUG1Sk*{xVkn(poSzx;?aQ03A=r8d0JG;nLq-R_m)bQsUoluCZ
z0_p7u?!&?PLjbWq!#<9$88}wS2uvR*2$u}}aEQ?tzPHI*8+8m@MYL|OpI@hsQerr9
z8cu<NXQo5s-%=(xeIUaveixwu)y5U%SD0lj-Ck6L7j75%Dp02_N4QKj<mlgfho?iU
za<it+)J3Yq<J&k<Z(AHlS3rQqMdPlq*5dF37>_+%w8!qZZ{f<0G~5Rs;M6CV%ZS^6
z-5FQ(0?U=u**+ht-~?jCb$GPo^&B*by-q2A@t>x+c^K0KjxnIY>F|zB`2LVS<s6fS
zMJ+T{mq1%|`XP?0eDd$MD9V}oH1oYfV;pD4Xc)o-xbG)uQR2YQf@LHCJ<oJjn-7k5
zq6d65YVdAnzLC03hye)pKU@b!28dnWgiFsRa0F8U${vL?f`j!3K^o~bdo*Ecs1t6B
z`V<0C@T4o=tplswa+QORvpmLl;d4Q#SqeLBf1J*^!vREdP%(4U_&i-dN%LBa1{~xZ
z&|h_#WbmU~yYR852r94=Wke(Qz)2RKT!CZ(mdEi*E;|(ByCQDl0^)plVg<CW&Ea&3
zk<B;5+;FbIBxq8IUCbQtm4P4q|Hh%yC_gnqJCMH_(m*QN#D%6A@dFJ)W363wZ@&4X
z+WD?VT7w{vqU#wBDh;*r^;aqs4=x(Vu(!xtC<(GHvxp^*R?5b}uwTawCm4$9F8FId
zD3H@;<gUj8>&s+;1YNxKGuc$%^MMh=y3RisabWVuX+Be50%1^zQn(u4Zt&r0cT2n(
zs5dO@#E0+?<lP#IbStu-Bs7{3kRV}q!B$wDL36xiH3w#7A*5R&3^#Uwvj&5E0-7~=
zSX#@*EpW2?Cj<)sge~N&xNLh_wyD`qOUf+-ajX}R1c0mP2Fm>rMm4m4f=q#E>m2DF
zML&bm!Bx%}!*$^rMf3i84z7?>OhYO`4PXUQg|L4M<;@OiRF6*~Jp;y!<Se*-oL*O2
zMmE>Y6ZL~P?0JT3WA>Lop!1uu6|+!`5}(ukH=4eTh4bde46o9m`8Pzx=G$ZJ;$o79
zWffTARhR61FE^gTmBc}$*l&9Koi$9C_;TwHQCj5=Xw8GiqYe^sZCfSmRp!wvgQ_98
zeW5yxx!6c-Cy-b3>zP;Uy(X+Lup0p7!*ClLPKyhPnm(n4or?Uz(LR;I9Ra{zWPhra
zAW#~ZZ-+D@KBc0~02#*X0e=|_y)IZ}HuOb9$mWL&W7R;K+hkRhXS=W8n2Od)9&*gf
zYh_pT8!xVbv2JR=r$n!)|HcUC{0<TW?j%JV7_{CrIBY>*Nn7Hqu32cQ+Rfgj{b%;@
zh@$nCF`g5lyiQ|>Ci*xUba847jadWhebjs#$GKZ&?_6~c__bo-rO>CKVZIirU!&uU
z$AyaEB=nFfMO+aS&*|M<O>RvA4&O-h-@8`LKg;ZpfJHbfmfGTv_2ZnF%E=mZCGh~`
zt)$|M)aZNaq<1=a)8*v7q0)rO$a@-VIm%qY(wFs)l0rSJiyvoWRY5lp65NRnJ!ND1
zKliIlCUb}z?EJV>$mu9_S0$oFsO@I~6q(RiOsa5vqkKjYXV~F=l`AQ&{a;Gj`y$2b
zdnw@0szQ2DF~!p9UK1oUNC$w*Ahg3{n~KgD{gR$m!5w4Alhy3~?>z!X!v=O57XVR@
zB(NCBcp~#DOnVpNY<(|9wps-$$)#^h`zMHt53!DX6J^ITjZOo}#eo5oj6r~xGhotl
zb_@$|?0o&P4u&fb$RfJs4@`_pHHX(TukmABe?J6Bp})PC_$`jLgX>}vf~~M0Ti2er
zT|_9tXbL;`TcU`WiN5PBR#r@Ju<StFIz__QuOkmg6j!T`dBc-qfu{e7Xb8K;K7bu!
z5id1y<GWWKbqH>7@*znHhO);SW*6!*i^D?;LTB^S>w|0=gi(qk>Q(ewO(P<Val*g-
z_nQuJzc&&j6q9>@x0CtK^-e>N!RrYlI)*?AU-YcxE5$%{UX2yyt1U#pYU%xEO!U)P
z&j!b#^cv?;f#V>^mkMy-r>kCE(%)-7D;g5N?P=NSo}}9EVM_453MSqs`>NwvqL&!G
z?omeH)?JU}2LIVBtKR!vkL<i4zXBc@9JnJ?+38uwcMrld@FuJF46})+dif4{VAk#y
zbB+?;(%C+ViSctam>eqw1<vb(9^-9G*4qeHU$LfNWwcnop=8~S`J)fDzV9z0JJMw?
zQCO2WfFuR`H7hlrUx2n=iwH5*<03_9wbaiX+`oOiWfPfxcq0Ds$2H=|t)B8B^LzMz
z`G+4=%=(k-vng)3RPyotSRRV8&{(TZ+rCowneW9qjCq_5?<t{-F^xUsZwodZsGE}1
zWGQ&+YXgd|q;F1@s-#B(h$I8}oE@{qQl&qD(L}3d8Tq5ZwqpG>f3=)e>@wf!!6nXA
zA&SC}mOn@Rwr#q<sp9K%KbbXz6THk&Uz*C=VK7?GIDLIp?+8Z*NNS%AURo>WQAxV*
zMbAyI*TeAAhr`RC;4Cp7_cr>8=<0xMV&6m@(w8=mQSesR2VGRf#MY?A;`%G{I9PvR
z&Zfj}UP}21GOzhwd>xK8o?NcyKGxnGnB48XqPBMr^q3i>>;_RDN8~qJ#k3J<Kil+u
zsp$Kx=GNA}I49u(35jp=O(k!g;cRDOa4K;fhJ`!Kc-jY^vPE}eZT#-_b>Ip=c6ktq
zz!A!zZvVxJY>bKB!&0P^9NJSxH|FPKxO5hl?|^3*2SpN2I4*Jhtia85_pu~O91Z*L
zCY<G#0^s66hIXwFTe1FE2^6{3UP_H+8zE4`AAafEm(^{OYuk*0S?NNnc;ihkZRx*n
zO+U1~{fo6vP#uSWR>tvBfcC7~j~UwDs?jGE@d{Ff##*TV@{D5<UVY)^e*KN7>4z*7
zLU_KwxZn17Ar)6dEM-P=)-|l66pT!X@1ST7Q<}W@4ubcofA?<NC-8tg7N%}HI7RP+
zYRn7|JgBw0!6HN>sIKV%=7Q>dWPF6nrPve>o7<<*-!r6dppOt+?&rm`CEr*4)Aob{
zS-M`9t+)dWHP*Okq#2bm;bD?noC4l4Th34O(oo&|{b<wf;9tU*JCmvt<%WV8rCGX^
z^g(YA9v!T2fGzJ!+}}!k_->h1|6UjyNBEC|^Z6>$e#8HnEZx5iLju~k=kWL!pO-H}
zN)kgt)VRF4am88;HbTsQKi|SI$@&*u>s3Zg2iMX$F?JEI;`F^Lv|um5_t)M1l0g-i
zwfSg-{DKqS!c)6)a%zr#&VJ=D6#K^df-AAg7Bfz%*{9i(zpW*i4SgTy0P1tE9g{+T
zGs+X%^Lxc(OF{z05+}36^+*HE<z3=qSgKUJHv?DV4-vJ8sPCwpcQ`L&Z3pD<gk~!P
z@o$L_ML(c6NdD;VKBX|R4!eYj2gZ$?jc)lOfht;eKl{WOqc@_&hpw@K*@+a6P1WPH
z_-$|Ks@;Ym!3oO(;y7vBtfgwQ3?8dR*-geid96oDPU~|8NA$XGqvb><8c{E(rjF^}
z88TJHQ6%1frIV{<9w~sa;g%DM9QH7eyNZ3p7D}Pn)+@CbjI+pS*@Ir>6ymC2>v_cv
zh>MeqW4$>+H3S4$km?Fye4bb0@noSMOd)qzIy8^7T@utvBdE3VAweBPS3S}w%DtEO
z7Yh723AxL4YJW|tI?f>V1M^Z*YeN)L%n)ipQ+Mr&_9xm}ZEp+LW!Gm(#RoD7+`|bA
zHY4gd2yYX3*j)unHR>CCp6@kvMg9@WR45kz^`-OSMM3{hwFak;=98vW?Go@u2hTGX
z@ZBdJ26#323(~}TO!o=Jhu7s!AH&-R%MCPSmnNr|MR5+Wid3tux%8~|bqfhoybAtH
z>yEx<G!fbe%th|RVhabeUgELwPmG;Pw@-BFl)AkA%}x2fXjiO74Oz&etC)tV88N1W
zB+wU%(VVRTo}ojDD#~`wFlDPq#SgER*cjjpOg&`Huy)W<R3+vS;#%g*nr_~p`gZrY
z%0jFnM&OV>Y>!Dw^?7Vaz=%pvjbtjXw04{|ehFuVtXWi+6QBE^^@`m5n9{2s&)?FQ
z5nsn~{UndH<ooEWCLX^<9m3-NOkp5KirZ^mzVfjIF74ybh8K8TdNz5Qh1E!5D3(|m
zyH%={Q@zQUVmI9dG)N>_*IqIrx3;Bg{Ko|`k$bJDp}I9K%R6Icx+x(VoVJXG<FT~*
zX&M7TTVeY^;#UdE1X@8T4I*{}EYLIaet!+27U&<?dMC~be`W+Mx8aBb5b><R@@_z-
z#s>+5*{ym^>ZH?<tGK~U^*eGQx!oTT<xgx@YyESCg48=MmmdxvTOE1bb@}dPO#K_%
zqK%-AW+93u(t2i^>91&BAuOu-ho$)p<o1ZajNRX!T*P(P1Xp03g?l!IDTdNBghcDl
z+r@VKtS_x}XqyYIGBU;uu8a;dUo-yQ&^FBr1A*wfX}*4{sr?!dtH*jZKgsLaUgi@O
zFm}?z(6~#t^syWG<Q%4-HWzH0qMwbpNWi5hE1+0{sV7l_mK~^@@xEGqWUoPtw4^5t
zQono?`d2b&m2<oXX^hZ$GzNH<-+>G@ZLJ0T$mFy!v7#6I`!LD&{^_Noj)TAvmzS0%
zqO?<tdFd+&&z2|Ix0|{D<6wA?K9Rf^#MJ&#Uba^&^pc#Nns;UbHOtfuMI;dAsL+gd
zucm~YRK%N9@GN&m=N^-KQ{Wq{Scm)1{gpnIDWdVQt0HAA1PPI>b`2*aDd&x53{#UK
z;})ZO{C9pBaNXr`&MmRy`?d(^|9g(m*(F|5A7=^P3-B&cCxa|;^t{vL+SFD5bv*nv
z(5pXhs$iv^KfC(SjRn<vTAzD!QJ1<Eoi0kwdvugisJt65jP2`C-D;gR_-*7@w5;`O
z!JQ?}OcBZ~!7`P%6nRKV0(|7)arLi>L7&8k!yS^_hJ(x(N8Wq~4?r^MY;}X#d5Ni>
zry83-td;;*NQzq1<*ngC+x~BcO8%b@6Jd51RA{MYeXS#of1k`gHMhiTf2>83`LG}u
z9cx;Mx?LYv^m4={2AD+EaOvL3xgy3dzfv<f$)&ec7TB9HPDfphdVYbpGLwZk9p<jf
z>JXHFeKF@nk}t?rno`NZ#h`=i))S@*H4u1LZioF@Qb>*QcR%jW?Y?bi7X>Y${qZcO
zf4VI!6aQV2C>q=1iA;`Z_k20(roPkMx-BQN{S)i0(wxuB<zsUs9aUuNZG{|XxL?$J
z%^ym698ei==JRn?bT)lz`k)ZiB}V>rSsYQl8sJ%L1=zbMD!NTr=t(rXd*1%1ey$m*
zG5w=@QOWsDI$*}DPK%qk#4k5jehcd3J&;j->FDb~vE3M1e&bX#$>Rzh5lRgLtLx$j
z;;}sFYCli!vjUMYaS=(l{<}7M=gP2kBm7!$yY=GajBJIi!oe{Q9AfOx<BoV{AT@98
z<o=$?P?n4f^FF9g$VGbyXC~LqdF5UGb@3?vTQy;~<w49fw~>PNcj>Ix9ie&W!DAdk
z)DiMKzjn3p%B4l*{LGu%2Gx;I<BN)>(=Yy9@&2%od-1rHrP-@My0KYu+V)e^e>8EE
z{YClv$#0)su=2!7x6DBnX$lu6e<j?!bo^7Ev}BT{@BSfo$KaUQ@W0%R93|RkI`D}A
zahA*r0_wb1EWxR)gLdWW&pTuYM8`0nSUFFgkPi4~+Wwqm_)N9R?-fd#{_jQYlFMig
zZp8eRX5Et?{?hV1X6_HGx&&e4bBxQhNdA@ipEE9=wr`kg9!He96W;%NqPSs`dE-5T
zW)us+N;9{0i&-#OY%7_0qD;8`ot*HbaA%hVm%%IGVg>gaLR2j(vUaA`S;zZ@s7s^A
z;XBDA$@hMq<byR*ILmG<?sQ((wsxrys*g$%a^D5l3|8@MVn#Z;o=*G5;c5ms6ij^S
zWH@SbQ;gPMlPa6mt7OAXhki2~76B5A;x?kcaM8UdLL%^cyuQF=|Jn!pr@By%K6uh*
z6a=@zZr;BKUqhZu@mCV-iS$Gp=l*9KIQx>w^r=}`y<nDbqYJjReX{3ry<n{oVy2sM
z$&+NhA+Ce{Hy@06Oe74~EO>~j7V0K<*rGlA?gh<<?6bu_EOmv*IW<3D{7T;Pf2kaw
zS*U0<x6AY~`#;On7>8);=Kq0|11`jqpp+9Pd_-m;WuQ@`p<Oj;tyoX1r?YPrdX!v$
zKlPtf7Dv|x+nTGFvpweXzMXSn$n|;dy{tFWn*tch%_Q`w{G1eX^}6<d98g#iwN2O`
z)0&Le>Ep`#&Yi6G!6A}_4sCQlB&%bhg^pfzDt8Xq@B8?I=O8ka##K;~ReTkkF@HvB
zcjH;vsZ2J}tq(7_G;We)gdj0G+c^I#91N@UU?Y;ql7mQyl@YgnJR)3k9bKe`XHjxZ
z=2IWy4yqZ57nwUfb18L@#lloh9yhO~qs=#=0a6T3kz)*5X974f8lQc!+xaZN9#J>Z
zZt}TBP&58&w5F(-2ItI{T05M6!EUWI`aqfwnGer+**;#dz)j>w{-Xk?1+N~fNNwW%
zk{A%=lp7AoDv&TQn}j#mQaJrC+4N&>dU0JDxtY;*`oxXm<Oz1{%H?Z(cav4)ke>!u
zPpGB4>^(?6%T;np=h?lbU&G;a%c)CTjZlp859<n=*mNS<Im$Of-x8~@m|@-9wFVY`
zW$b$-&q>)x7iycZz6g3rW^MYX{O#wlo%_oE9Y&w-d_Hckp4JV%v%G1dgb*p4b#u_R
zhY(D$e!!|)joXBTq;8dDep)pZ%BR!$enTYnV~SK!V5{Z|wxK{;dV%|M7Up{^TMZ2Y
zx7o?lTTik@{)c0;GJU7H{xs#b$Nn-l#i;%tnkDNMH=JVb_-%d=Gk<F<CmJ>NXmJYX
zq&P_<_XeV$!M1$!gclM5&AL+TVK(HkEVs}qp|8140;RQTP>wpNT!(zQRs!+{TJDFl
zxJ5Gldpvh|y_250bk{$T#SoZzB)|=kAO5H-$lkA!=EIkB+ylQC#vX&#kJt!EK1yZu
zlrFQC9qpV@R+SPSNs#`Sch_d8`9={hkM1&p&I&6p&iSQ*u8IQ9&125b<55nmKt-d1
zD2Qgp+plBtj##q0h-7s?*y+*{-(s=Qqt^oj*Rvzzsu2~$)Z>sq73QVK{B9~{lRZD8
zGNq2i{+fQu)sLs|QW7YeDkJEbsb8ROa$dpx4=WnqW08mIH$`OjC&1jH18<#PPafcE
z`gH@_pC}@DNu`!LGK&DRyy6e)v83D_dd|uT0}YNJm~OG^7daMX0ZGe1cc@-)mZ+)B
zs1G%ACuRYeA4f5xx1aoX^1`Tot5i$W{5Qu_ktDBEa`FK)i_v6n<$TDiEsQdAnZD_$
zZsm2c-g5J^q@Yp0+B!SButdLsd7|Q!t&_F#!T!@n)#Lm9L!6W*wpPex!i}^IX1QQi
zMvVNMZkR=S!dl}X8~p^obT~^G*!=hD6U8l&;yOAbM^ZR{&N@9{+rZj|NQxVl=xbdl
z3C5pNDS1uAf|KgfMbA*Ostg=ZjD37LvwA0(pagp?{Y&AcJ$aCuGVKi0VHYx~6WR!D
zQJ$RCg$(SuM#5yCplpBaa1iQC*_=rAyL;~+!XveMbHW^#^CWOfyy8hlr%OFS8R|Lb
znm}%^)3|;@rA3u~A4;~&?25TMm!W>qh<44zg?v7UA^mmI(wJGMb-Gn=?MFxn#xLhT
zCKgxxB|Wdz0J7uVoTDss|02>=)`FCHvm^1*yw+4%Eh_c%8-lRd1xI;J0*+jhq52Hi
zpl_ot#{Kb|%KFq-=B~=;Wok;FJ%K<bFYC<A<2|ivlP57Y**sz8h=fC@We{ZukaUgr
z12FeVM@`|%#^nQNCspJYR=>=jgN~~VZRcY{Jb*3GuV1}UA6j=$#;PX~+`T;W+`z<C
z^oxDxrL%S`9(|NN>u|VAh*<5=b938Jzn;g9IQ;r;+&*hv_QA|wjdhc>mD$hHz=qWd
zzXVJ5@k{sAOZIFsN@i?Eu|HkYheT&H9YRme1rR7Y3ndr9Z#Mip*w%1kMexstmgj}P
z2sK|-hkpOB4p~p~Y8F^v9TFGuPDGLuR$!!7e(z4M>tt4BYbvXkphcW$!VTdw8F`5p
z3a*mSwQ_0pAN+RS-!$Q~mL6(`u}M{|nLNOpj_ZeCODA&BQz-kuo)7qCXvlpyLEJV7
z`e#9?6hjy8PCO;a<8E<R5BksK{ldY$GWXG53q2LfxVZT?GEbVlYbyB7$fthnfNr(c
z36sSE_6xlgz9zC>&fsH`BA#}h4`TX+(a)WjzAkV9u|29pJs+2m_5oVYP2$IW#(6Fr
zu^q7bvHBnHD>H^fnU0+jqdB{_`au|)aA&q~;TD%;%}>4Hp_uSeqGiDN@+}{&Qgf}p
z8D)Fiq<x10kh{oJ<h@&RRn^oHQUu6^k^=x0A1Srw$~z73D{<_X+2DKt(*fUH80YAv
z$9nI!NA_3wFC>819v{85-)Bg<&KQV#iFFu_UWe+Zs=dc>YoG6)@@0qp-9Um*mWT5t
z-LBmo^*9@osf|uyX0fXK$giQ6^mvuXem<$HA<W-Bf;Y!0(v?0>nuZV<{<E4A)h(p5
zH52wX4WJT^`bHi7x7~PR-|^GM^9An8n@*fw0yI`Ew!`Jjx`q)VRuaFj`k@%5e6WeB
zmd1p>a<<NRL1qP5`!V>jhw;h*XDIh`=Q+smIPG_?;4>d(aE{80{<L7}T4(B^bo~T-
zbRC2A+6ipD>Xa0UDbZ`t0<Hkgts(fYO47LV+f9|F-7HooqHr=4HmYHFCFTo~*l7sf
zJ{0;FzZN5P)`M)g1If%EO`OZ{b9_4e*OtGv<`<(v(-m0}^<LUnA~?7Y)<5?UR*HjM
z@o1#^@zsGq1t4B9gDU}{#r;2MtGtzbG6O9!S*v@i{Q&clfAT|(T2JEFcqd?50)vdZ
z>0K<my^5mcbF8~!T!5)iL=BQH68%)=(I|q<<MuTW>8EgZN&{GU>{`t;Sqy;TbfX?-
z&tAWnVu{oFv~)z|MEod5d+I8~zNm}YpzQIvt?yNB^T}lz3h~9K$n7`zyO00<M7_nC
zk0_aqR6e!YwV-1?(MNU;O<3wf6@edTIpaS&&_&D1Fq>@2;NC<Lh4WtYtnl{FDG6Kf
zdE?aFsRE_j>v4{d8he!E4xLr^qg;%#KP!?{iT5F7o+eGS&h1me=2SVyDHc0Z6dJqt
za7LBM&&uQ2ymv77<n+rK3?PphpC{CdJ?10&f-C%b8Ij>Ib%lS>`1jWkvrP53Qr2Ni
zEo8_Rn@HH2-30R;H<k~xcRQT0PBYpH{<7Wg3R}<S1L@!Ls*!NU?qFL$C;&$CWaPs?
zqNWjLcF|Hfo(UuYZcd3R`OS)Tr4ZFDfAUa}n)EvnIL;aIZTPkdMg*=Pc1KMhGeHLw
zc3(0;SGd9blbO*BR=ZA)?5!=L0YJzFaqrP4+l}?MeSk#avT;DKHr_we1rnW;@-?iI
zV6EjVAmR*1NL;6%Lg(l5pE!!`O0w)gNOQ(P<^r^idqF`=La${o^6g-@`L42KNGXcy
za(ySb14l&_s;T+@<dEegc<m&n?+0G$<ZPb?b+t6d<l8={Yj%Lil7Agnk;q&9-NJe=
zB4Sn4Z7)+E-9!4{mKPnPq6l_CoF&xOzM@L<_H7bo{KGZ3S|%9d2`zHaeV11BgIT!R
zKr)LsQtUJWL!vVSCJ5)1SsQ~2V8@V$*PedMM!!Q0Lv=XtuH#@z8tlm}M&fTep-B#_
zuU?}Qz*qQkyl%eo%T+iR%(tPyWdeAWoju}1CU=j@>$n?$fy6u8(6wUI!4&YOSXXoT
zMoe+N4(dPou%5<pV*lzY#mO|H{t1`^1Abx@TlyR$k1VDf-=?F69l(ycVX73;Rx{n>
zf3b)<F@<EYAv`A!k~zNmtD%w{emRPENoW-A9ScS{L_&j#4gEUz%llS{P2wZ75@#)y
zs+e_;=6iU;UedRGk8Icik!G8g+q>I@J$+4Kc77aMtAnt_;r1LDd*Ja|L8Y5IzZ0I7
zifQB}s#TH+aJ%~h*oct|*m_@Xe9qO}qL}sJkY9;CNMyK;)|HtS>Mo0UbkBt4(dAuM
z8ns3KPPZdisN9xad_soiO}tmfpi$Laj(W022{;d5xXOHam#PEEUdjw~+dBHJYgJ{(
z|6s$fbUL(M<0dX!L32{@dTSeT4;2?5Jhphl1xd%tGsYie-myuMnB<FU6B<*(=v|6o
z)2xz8aE+aH#+k3J>+Mm<(<y1O9_g*5-S1T*D6pkgeI<1dI}PzVVuiShfiQTvo<$ez
zny2<(JHmb<v>FNv-BfyX>1}vF&Q21+ZbBvoaPxEAe0yv-A7DcLTbIr^2atMD5&u}@
ztm>W8K9(OK9Q$8hluzW40$aV`(^Tm4=w9Q<?xlJeDFws901))lsj%r=hj4NU^J6dB
zaW*FI=445%Gxd~zFw0$9gAjGSY26x1idOl3YTU3pwmiJ%%$~~h5{qJzyv`=~gH7f}
zbL&k1fSXsc`XXmkm9c7nzGF2T;)VidgdsMpX)SrB0=Bqg7zLdmKk`^jhQ<~jzjwzR
zTf+ieJ*l|=Ei?SM!&d@l6{`-ZE?Jj|=}Ru=;Eq~@`-NRY;P;lW^eA@1hxWI@7RY0M
z61<k^hcmai#?owI=BC=@R<w#dgE=2bWiIB86x)w|%Q7632QIW=Dq#fb{GoN-Q9x)Q
z#}2;1gsjQlTgP|&6qJ{w{`&(&U_tfc{jKSqPjRudeWN%2rcX&{i(zos%s>8Yx19x4
zU;7Q8=d}KJ_Cs?0zRjEO(ak$PDx6H)hX?GseOIZf&--jJANGbV5kOh*?~k6eA~6v~
z5@^gUQn^+<mKPHtxpOVE*BMmPm|Oa8595LaYI!d(*mvNGhpm3G_-ahw1xmXglZPg?
zu6>qqum87H{lGG9-{Azb0zI)_8)%QD7_(tG3|>?iD+<TOXL1^I$jR}F@(c>TTWs_?
zW;3)71m7X`j~ImY(<iD9u3^Dwj&)&@-3CAA(@TWySTX;b<9pequz#l%XT+=cg2e+*
zlt&2T)Q9r5BHtyoDBtI1I5=eG_(eU^R<tMmi_}>R{&<cOosI71JKuS@Uy`)q-M>Wx
z!AyitRG=sqx<#B0cjoI#<1EwyjBr9b`87;K)zL<~nA|rdDe4J^LN|DTAHjN7I;+at
z*MF5-eMG&YLW04!+mYc=kNv<SUyOFg1{Uj30Nycs;K+x$rs1jivG;m1Jl3bMp~>+c
zvZq>{@FsfkQm6QL$f@v!ty_QnD_<Fb{wU9lQpmWR8og2F%yJC+KkRp1;b_U{7%uX#
z9??$}(9Kck4WOPVp@Fn^b4-?ISJYvdX(ko96gP;LO^eQkwJ{UMdBZ_eHD7gYJ@bFw
z(@-|DO7O*zfKGilOZ3lTJw9G78nXwrih*2PDiK&#8_{EWd!<;DG$G;dR(pm;vE?{o
zOy9d^*1g-~8IFZ&H}V1iy{bAH1=b!!A7n#56Y!?+pX+QIz{Vt%W*!fp?5KiDAs=$w
z19R(oU_qP#LRym<{}HJiJa~=(qp=xTcjlS*ZmeImQw^FN;U~A$G~=Q_?toOMo33Z~
zO5$UG5o`yYq|qQ1_N7l{m#<;ASuiW7BCbH<2QeImuG{qrlz%g{CAbBZ4$3FAQRR5Z
zHkW1UBF}K0TjcO@%URX4bISkA+~@>QE=dFK?aa>$k>5n(`^5e$dt(?Y_AEyix@WnK
z&XD1LTAyVY#2%0sgYV-Ox3l#p5O<tf-{l2umrsk8)B8;xbRh@AIB1vK>HYjcMxq&V
z@6pvd?p~iu7*WjjJLe>@Zw_e+u3s$SDhFHY@WGT}N1-yd1FL0C5vT#Q4%k)lAyj(m
zN1#*dDwT5mpPzvQ6{1z1;4|^_R~>}QXyWi#w+a&=qcNIfmDl&HzIaW&6d1o6!VmPt
zhr*od{ijs2ZN^yWaY(OI{sfhzIF=pG2{y&Mr0kaUdcOCvungQ7F(7G+1u7E8ls1Lx
z5zWtlJ`VeKbcyaV=E**U%7ig}P+<MnaK_E@L4{oqrvgAEY~iv!y}p~1bb1VNazHPw
zzCH%!g^Q{$47=f#A5ww|dO;3Yrx{B9I*TO%5TBzNB5U{(9{tkh<)pT6w+amuXvQ35
zr-r4mPS7O>xdi|t$qUdvI9)>h6xa4KTrS8%?ij4nz5WS6?fD2J+JrR@pg5LKD$7c#
z;6gwkk`_Vr$hxXi=^O5i8V9r>&E=ZJx<KpJe09%H(Vm6j(RJE5{!_}k$F7~RtUz8D
z06aT98dlwc8pdP4r>X4J#Bt(@Chk41>Yr6zr<RCKDZ64iM6oMu%7J+>xKM@P7{jn{
zOYgQs6=g6g<d@%5!PU5$+#DobRMY&>9tRc+Tv2QaNYcQ=V{yDymsFV28EJv4Y0^FP
z5e#hDU0H8D)UKzh(4bERr3@qs$6Dokq@k+i#cqYa3k(PP+n;~-!{P!M7|L6!E@QZ~
ze%UwwxyO}45QgthLWzOYFM<=S1zZ#U3Q_}IyJQEpM<9PJ3U7!<L1N(YY=o>O;qrf=
z(<}1SZS*kIPgI#`31Q2>kK`DZcX++z%q~x{%CKlv)PByG=)fiPVT?pLtQc_G7{SMF
zp`?HFpF}l?wZyc8+@LFK2vztPj2gzbozc^m<h*OX+Y=e11b-A*w7gZ`CIG<BBIs{p
zBr*SH)VYJf3q<?5&B&#yrb((ATzeo>5m8WfI194hR$2@?VEc#TaQ$z}8_x1`*P#rA
zWBqxwAoyKSjNAi1WCi*fYF`e1KQd*`T7Wl6W&Q=P*PO+I%j5E?hU)uPjs5@N%z;z|
z6&B#-D6|5c4S5PWa4bVT!lH?1EWHlvCZ~~Ipw%tq&|LV&>yvKxA4^**L|^l3ukpT6
zlkDFzmY~G@&ogTX7*^-IM=@`1Pv%?KDxa$A*0oLT;E)j=AtI{P?grnUB8dNrCUQ?=
zC;_j8-saV4b^>F8t3x}M(&>eI%ujUpeJ7#r@Y|QG?pwXfi48aGoPqm*2jhdWUSJ9Z
z6aACB@6+-sxRW&U2u|^09fxO2-Lhi^`v+N<{RAjnrONEjQ&<B7UBz7Itq$m*JA?$F
zmJnG<^D}8C9~TE!;Ah#)Uy>}*o{5|(2bOjn$Hsa~kvhM*yZqn)Z-V4GZi?uj8ns4K
zzE!Ge#QKAO<CcejLZ?5epYIE*KhgfyxQ1+FWerT9Ka>sVH{R{NhkW#n3rwEK|Cm~U
z>klALeK-wg%6?mV+9tWqWh1#4&I`y!fG`2HGvX*hx2PrpQi<l$X78<5AZiI{CQj3=
zN<np5ocZi|qqZ;^`jSm#@fEW(NZ}K~UdLX6xp(lf1`r-5pk`TVD7H~Hi4`*TW}IZ+
z{H;e^&vq$^Wc|U|?nJCp+Aj)Otl5py$53UsXkcM_1;6VWk-`_vuZyPa<rDyr2Vyge
z%KPT|$8*+rK-E{g@Lmsw0Yn^Ho@!G?(IA7|sxLt!eW}-;c1}rRrct)}RwM!~9VWYf
z{GTFgKz4j2>pX~juey8hI?vCLV&_;N*=T$K8W>n-=h$bJSAQ}<{zt`wC&bW}8*qdv
z0vkQu`ws!G^cJG}=JnN=Vy1wN=LCC8vYzZ@^Ya<KWS+DN+o<YZc<h*}g3SEf&86Le
zDt0spkfej<5rM&53fow#`uU4JII`|R5SsyDzPRBmPTAR>F#@rD2a~UkLMKo6F|2_Z
zmjof~nVc8cTo+-p08EL8Qd`_2PPWOGbYZvcD>_nC)WD!{x(o?a#@`f&x3rC#%cv^w
zF5vsclCIKBy(B?L|6nKcMqhv9XpK>2C`5K)la6KgW)`sWVi!)g82FL~5`>ekR#r8h
zoz8WUR&&`|ds=@Za-syFH;Q0PAgnDMYS}2C%ONWkuynkRg~kI8hMguEeCd9)Qlo35
zx%2Y3AHNrQI4O$Y3^Gzv{hj#R9E%^zcJ?z{Bi;Kr+y9|#kh-G@qgpj{YK=%Wb5Seb
zbs*T7T?x|_YPAB(LwwJXIk|Z@j<r^UUF=#(HM-{MNRLfhK=(JyXh6Ij#tjFNGzP`m
zYN^?g+&JD4cCX2{F+ie#%hi309#X<)&`WrMBcWKc0z@{3;{w92?Ma<di^&c~E49s_
z(1sW^XS|0h{W|O?MaDwb^jo)j?|`D~o@Z0nhnLhT2H2I-Pu)GPqzcOn(=f%?{ZEe%
z_jVdJk1Z>m*KuW60dg<k;Jcm2^5d-IF>I#k-WKI~+?BSV4Ge`QNOZ^2Z=enPf%Yp-
zA2<mNNRqHlnj_7BbF(rl>Rf*6fEe=kr-RXYXr?SbqQ3ey+0)^z?cW`vr!s(yi2&Xr
z-^aEtN{uQ3wO$nA+fX=tBu4S+Q*^rx(#Q!_IxJ&%<bu&!Q}>(_%tK*Be|7h;x|-{h
zbN;1*m{ot^FX_$?-v6Y^V_0heTB8gIM*Tr*3dfQ`D)$E2I}4t`3RPtf>~6{iW4Cjc
zjEgp47a8@wEmZ3I(QD6j*q;(y$SI<iQz1oQ6lY&`s;bXEEGcHr3@QKff+oGhPXPf2
zqWA17Vmq&6r7&g8p($T~zw{@8G6qrr#28Bf<-z&Rj>z(a7Z5Q1Rq^JS?~LZnddiW!
z6o20s&(ME8D^iCW{@}~cT7vWA3{ziMcYY53DaNR{W=Kr0bOe5af;H9>L)Mp{hEV-Y
zI_(N<7CsCXHg*GE%cOdRyHY@Eb<I*0Ej{>JN{aeC>gL-<2OghlAI^#R?UnnIEfmoI
zMgbwn_iZPW0r+QB;`*ssqtve|5+y)xo{uM!FW-;!sov<}asS*xQiSgN{<pEh>QV4#
z-ipGH$KumqOXS=wCJK%%uJgc#7&hDZ4`Mmxq8>Y}*RS7apT9BAXUl#}E3z~g4E@x(
zs9+)&cN*6Njd)4XBW}jMi|?_z^z9SS8*S8(y^BI7(n#D;eduz1Jy(oT%CTIqZYLeM
zt7VqU2`#%&1U}#a3QGK`Yn_cSh*S&L?+pCQ{ii<gMZ@{)lvt2r>1K~ev1>F(gk?E+
z4=6(-lS;boR9<@JR?<cUUV)Hf$rbc|^f=+j^+`qAe^Hf#7&2(e<J{5!jXKpn1Nw?L
z-DirusiWYP6N~~vRKA#z9c5_oJ~S*%Qk<Le2J7+Op0&Kl&W*V>*HE%LyZE4F%zea-
zPDs3B!)ftO=VT&O97z5A_Ya#*JtN6oqSD$gL^b+LgG$&(1M%^IQgFN_yV-tQ%tCi9
z31F8giTHPo{2L1QVhgco_Z+$xXfYd}Xt}2Xv^^UQJMkR1S}opdN@dvTE$QdI#|T){
zS+2OQfk5YO<`ZgC#3_Z)gILouQ|fh@3GN6j&HX-HJ3PKP7aopnZsgi=p0KSH|L^Gl
z3wkOiTtNHdvZXH2*FRQT2)h3EA}IaRr*%8wMT~xn-}EC-SI--i0=#$Zg*KFtoh?(w
zGui3)66QwXccgk)Q77`e*B~j%*TH*aK8$4WQS%;z6XC=^qRNXr$9Gh$#wK0lQP7Gc
zz!v}A@u9`7x2XI9nqbR%2665vykx<Q>+R)NJ$p)36uVTVIBxWT>IQ|S*f=y`*XeeZ
zcHmd~2_^knP*dw)slt#^ACKdo4=~;OAh=T9HkM+4wfpnlCnvkz+O%Jv!<E1kSa(Yi
z@<v(D+@?)3lV&ajG>mp|)4`(W4`VOFUm0~ow%|7Drk#0*>9rlm>1{mTHt+1_gq!Yo
zuf6WSz_fja8ub0#hRskkz!u({-hX2CiuWUk3!f;HbAJBcVK%F$dR_4&mc4K^)`}?g
zk_BPP^N)_cM_gaq1ip{C3o7Hz;87;q+V4Jj$602g8Iyo#DuZ>MjA*9pdI8gmQmm8Y
z5!t(*tgTNq@`_m<E5X^__isx%dJy>?CJSg!TfQ$Xw*GX$Od^xECurDyjMW(diU!}@
z6P&Q8^ZdS|iZou@_XXG?@jK)15LcU$Irm!DIRTz;-6&XzQr*g^%3f$zWU%_+MLc!n
z8WCpKp)Dj4inrg|5<a6ta^qQl^fji}-BRCOdP*6(f-gsH+6R4Tw78S#j82e0PLyPJ
z6)v=1o?2B3Vd74S9T7})#`(+U^s0M+QD&z+n5D|ZvO^%X7>A?Qgl+uh{lH&|S!Yw$
zAa1CDiRgxt?))FQ+x+OIJ5Tk5lFnnbTPVi@H|e&FC{O(L_DeG->VgiHbCp_{I+TlM
zR25l&U%+zkPrZuE=2yMg+la|?E6B^8zfi8jAl@^TApwd{ZY12XB)a;giQj^j+_Q4y
zsD#AoRt9i|NUMr<WNM3E0j8mlJ@#Cu{=xC~Z$B(XFXZ>PB<*|Hro_n1JQ_^VP6GPy
z`VyJ($a0EG_pryv^41r79_9M3v5B*)RpMp#Lb|&IT-vq1RF_&z?d|58c+#4M(iD6&
zC;Iz2+AYMZ4-PhEuPxjfmX+kEN+2)J2R?|%PwmAiVSf!Q(gk($&MQ%tcI9k8`zFuQ
zA~Lm_pT{FkAkVjLI{1}~U^X=ytmo!e&dXKbLAfa0k<rat`0=8|zw@oJ&__OT)kVQf
zqez_1mSI$_rY~i9W62*^4GwZVGNG(+k*u2q&G?84UV=ymrDyZMv!j1U(g)pXgU5vE
z^s>_y*obC}4um9`0?cdWsWDK0+U-~7-YW0y3SDhJ(S<)%@YLrLKqMK^SOw0gfS>!T
z&@mYx&PuC|W!nGYB#%1s{NtkSAVO#q27Lpon!J;5+9tth@+8=Y>*-Ea7SR1@8N!OO
z><KZy5&ZiZbP&K9i`AA}Xz+Xp)bEWfdJ$Vz%q)BF^zB6V^W5g!WKI7c71aVvb=DAS
zaTrk7Xr^BMJVo+=4qGpGo$R^O5pzx~Un>(P{^4ON_mdc?CZ2KcFj+85PiRP$&Rb3<
zKF+@_BDfkv91O<J1hSs0d^Nn+b*o6Jb0P>9x0w@nFBXjn+7^f$M?aTJ{4kLk*CRRV
z4uxb*y3O`_8ya^W7KPalDm}~oRw{AYq{d(N;QEhSC-}LDpv{0mvXJZ3o3F$1lb-Gf
z0}CCws`zb0<nrq+q8$xAs5f$IBsZDqJ(g;Y`icq~R5{q9rAfjam~JrSPx%cP&aY>`
z*{4%8|3H*fZDdQ`vdF@_dT&;hI;%|PjE>v=Dis#|MJe==VF;}$vaccY$E7n@eAS83
z@cKEziJ;`Ox;|Ox%vddtK^o7ljMgTl#*w<t%kgjC)Xjf|Ex2TO?k$xA?omP-R6a*+
z`6Gw6R?oiY*uU^x5U2_8nQp6gRm}dvH#9J`XMJ=*t>W9D?b)B0$d~u88Kq}QT&}s|
z(0WwA*2_Oh%7b{@t67!`PAeH&3lpTjU8Jbw%JQ>r8aBx9QtABp{MkeenfQA7S8A4o
zG{YrXk8XiUg6WqJAu+*yp)m#7?U12dmz%L77vb3)@mXunk&Q9?YH2tXJSkcD=M4O>
zWMSS@V<y*=K;b^@Cu1@vMWQ2fgPRk0n^R0tUBJ9u-N%T352}3M!`k>L`El+2-+_!T
z?6QfXzKqLpGE0rFP;z4ru%ugW)&~#9lJ<d%*QtNQs!b`*A_N|c^)zq-v5saxq+C$1
zR5|?Ec$;^yZKGl&OLw{lo{H@kl)ow7uh(aPx~94V<C{rrKUJ&qiD&{{T~Y`4OBf3e
zoT9?t|2nzL4hz<G5ZSSbBY)^g5%q9%ZQgkKIBo3}N0{<=0??%Y6KF$OVSs?HH-1rB
z*)ix|?dcPN%-2DZ&keLwujBKo|8yyD&}^FpQfmD1?eA-Ut#&7Jgw~k;&8aLXwMF($
z@VxSuXhjWU@-@Y_*rUL&x+F7S_L?zc=>EY^eBZDg=aWtJ3ma`~^x<}y+5gAUd55$0
z{tY;ZO;OaYz4soqM-{bNirSm1UAsm?wWz)Mnr)5Rvo=Z9-dojHvqo$}B=7mX|GMJJ
zA<4-(&*y&b+py=R)hX2*aVIzw*F@(UrIuoXK@KEC%EdY-_LPR*B3_Tigz!&ENo1-K
z7yB<v=+qNw%#3@PW)I(YAgh5j#Li2DJp-=N9{V;&(JwE)t&lzy4jroL`;w4@Jb&O5
zp70+xrTODZYu}i@^~hs?Tb%6Dh^8e8og(B5j-M6Bkbjnx(?uJ`+AqYpw=h=Cs?v;S
zB2=ii@b~S1GA;jFC(8|f4uQym&-D1aeK~0x2bep<0>M(?FD&W2zuz(MG<-aqsHvy!
zn&XVT1&eASSKVXUSPDYRFTI&ydizTVVWZbqNejRHjuYNCZ4e}idCY0$r;j|nRLk?w
z;U2K`Fs$_*en<OVhfJB;*X~D7dS${gasg5LM$^FbRSSDl11l$k{FXIq7SkK-9a&m)
zU#O%*cjJndvF(Qgf9Z=Sm^jYJ)80LLA#-=IFMMHNa|ujl7Cm}~iKj1p^=L1`u5zKZ
zY_Q%o8}9vsjox6(ybjRew6x*=afvpu^5!Jz>3fFsUt;tidzJ9cI?JSk-PQAOcJut8
znPES9bn;Q$_O5UJ^F*53P}(=y5Py8d(P(Yw*s!)loU0e6s_`T<%6R*Y4Irs)Gstj~
z5&fMr;;L;_hWew?Xl`-(^TwYmxE{A1;s04#N6b}(z68%tPJit@rn|phG_v4m3Eq^;
z0UPYczA6_X&w%DBy%CSANckc{W|@B*@T>epbmzYh)V7~K*+?XQ=?qIF4JBz03T1o^
zTJ?e0^4cVfoE=+y*fTAxN@{sNxl1B`?}De-ks6=y#R5Ul!#(Tp6(#uYz1ZNuSE+Ug
zo-4@W#X8LD7y{=c=~NRNj7Wb<*0%7*Otn%5=42}Y$FOHyPdp_>8iGB*3E40c#;Os9
zju3Y8*f&Z3o}5p-;c!jX4<o6Jah$7SwL`(AxyyqM<P~vx%}!)-pwcs_r2O<&N{bb4
z^AM@#uG6G^2}3OIrxh0hti8y6)*{_2nvN{PyQugPs>GA3MO5W7m0yOsV0X5X@vndz
zjdh&5V}(gQVb>g6R*!viUy?Dm3d-O&L*wv?f<N~P_X=X>G+^d;W&NMeCV-xayEdaY
zf97(QKuqikYUqbg^I%ftp)J9Rr9VnHkGw6-aq)4C-a5xgPKmdUd^y~;{*G-SHB=f^
zt!?oaHj5MQJBr_eJJo**t+*{>(c*(<UhzI}K;Y@SAqZ@>RhO~|DUB#l^{f>?D_*O7
z=&HC--m0k9^TZIH*EDT62rgUTDRoHSq`OH=%l5=7Oh^gMfF0ctRv?pfY*vI58O7_t
zm&s4_?!)sD1WflvIP>K_(XL$zt~-fOp$kqT^Z1j*%k{&Sg|AbT-o`D*)8}15iO_hR
zFDHEk5;7J~9f#b{xfw0vWXRF@3iYn<;%r9TC-`E)qU3^H*P*xt4+EOPWW;V(JP)ty
zf5yymTo&Lc1N|Nj%^tHMig7cIuj-PS{yP{UKnSrU{k0duuu@3ySQ_uG^!me_2WTR+
z=gbL_eB(3O0az%WDAl1-AjT$Q_kH-nh*ZCH?~8X}lW?hD*n+ysxMQ?X0aWX1BAcaH
zqbaI&#UR3lG~kI7&!H?S%z#HQlNP+RW&?slL=MqHSEUac+pW`|zDD%68mWYIV&U<a
zai@ww%8?hvYfXW}s2l@CAuh8!1D1TGl(dnJ=JTy&h7K+!)b7KpKiPAI20Q;z<q>=D
zDge!#tSRLk{P_SEI|I}T?w6k>=Fo>hO+-6>(RgAjXUdrxlKVD1SOpyE1k9HU>Zh|C
zc$jzWO1=2NygC1SO8F_YP;lc1tI4@0UrLe^4ipT>K^lh>oJom}WvDYav_zOHKtm{>
zI9vHAd+c@37AfT5#YcEV*2v#eF=8)ccj`SQg#P2wg$jZUB9uvQ#GQQiIYUyuhBCm}
zekU;ECO|#$n}D5sUvb`Ol)ZTVRd2KY3C8DaNk0M~Vs7(@V3?NXjo3U2SAh#F0}%~j
zuGi}pjD2+ZYx$&4;e_@i?_rn+O?`c#YBQ(#_&{p6@EH~tjfW;4J>Um5P6Z@zpOXGN
z!8d(-kB9zBJZWB_6MSqmAXR$h*MiOMA+yL?dLP$*D<&~zjRcnOk%-P3s<)}j^92uM
z2;QKw@au>bI1c}P?h4KH!nTA9j{&R1l?a`l((u`Hm*BR9Eek<cvu&lCGT=9JSI=pG
zk3V|uG#n3xsoUn@UwBz)&pA4LGJ|XBMqi4{9-aSJbj@^IrkixRog1P4<*RWa4e*?`
zR&`xMlu%Z3<8-aVMcif-1`aXMi;xULQU4q|eP3ayf2YWEcog|j%Opg&E{kuNXH)z%
zf0rot-eh4~Zl`vT*uQ$TU-u6Z6o*&?@<kFBPA_(Gs2Pe>s-n5bD{vFA)HhakX+o}E
zNA6swKkYMR4{(`7)s9{oW3+>Dj|e_RA^mrV3tob2__2)l!#zC|KLGaXS{Nfn(7`%}
z$V5b^-vkZ;5|5wUG9@zQA`@y?j8fg$HNdMwrprB6POXn)<{|976O>=K0)>zOJ*!KX
zr~kQWi+{UrCJRl@2x7vtKvZ%r8ryTFj$23nvR`^xK=J$4^~&en=Dz-P`Zt1byG>+i
z&%AxhRKXkS#q}SPZ9?nc<162hcuMZ49`j>m{R)o#esNw{ulBY4+r2MMSwZ$^<_~fo
zA9Mjbj>8YWGQY-Mcx=n>=jI*Q!?;fV+y?ldrHW=G%H)2m@7!QGp#&hIkOj4#c66&#
z+~Jj=$0hzvTK9E0;5PccH$vUa8St9z9*b-3_cO7%>N_{*MPFDI$u%|?pLgn<2L;%c
zej~l}kKDjrNF}(`3QciJvBoA6Tp5bB*~9oL)!Y7vKD*0Y4aoVGgs|!Dve}b^w^{y^
zYT0W)6uSb5K)j%(AEal=`%*~WIVaZ-!DnkDHBK?}ffYa2VBK>{=V%hYwZg|k*i5nJ
z+!o4jBMMzO;W*)7AQj~`>QE{}HNHZ%e7%;mRjA%h@ugF(|HCUh-#gRI)j&Y@g^Z>|
z<`B!S%Q@`IW8ZI~EP9m9RT@8!0QQg@(i@R=vjw#=8aLL6*YK!I6!!$Q3RBM=AA_E;
z&*|vrs1gmhJ=eN-2&2eUxZ?V^x+!-Ik5GnXbzQVDB=!khDx?mQ)jdVdoEkW{%so3C
ze_hl%?9E+#w74a_u#4#6Fg34z>D7^bCP=$xmJys&H0PCwE2Ios1%oSbO=amA8+RS^
zA6zU)%G_}xAS8JQmb-7i=w<|u$ne0VMa28y+y^icO0*o_)q+pafd#H!EsmRI-rM{m
zsZ!5;2xjufc#CT9u_Al8ff%naSZWo+xW1io;4LBUm=IiGmRRZ-AFAw~b&|`mK+&<R
zOY)m*vyJic6kz{snS=C~?>;G6?gq?QMLJJ-i~1*BHt5wj0N}=b7F{(GoF$)Ka@77)
zsSeV)EqDBJ_1p_*g3BQI<A+5WXvfJB{Ye$AOgCBQUqugyDa@yzZT<#menBw*thf&L
z;DZXgN_9Vf-m3gFDFj?*M(dX;Mb~Z!30APQ>}xr^i~8rSIR9f!-D*CJpF(kh4D=dC
z`H-JmBc&MXcr<H#19z`qv$yq!-D(U^a32VY&|v}r?0W%}&Dp6!G5Q^%G;$wy9N8;+
zZ{!k-aCWk282QtMUgAKxe4Jgy6ZU0#_YvQZ_V=dTu1i;?Blz$Hn0$Lj_}VL7Zc>Vp
zD}W0w{}s}lM3C3+=G5vFjGj(Z{=JwGM|tI*$&8zwtJ7$&;hd+h0aKYsyJPmicPdrG
zKnSuEQ~)U)y&L*r^$~C>5VfjEk{0ej(_p+K|F!Mk>w4?;Jd#Q3p9Kp<=H@X0>p`OG
zx>;fYb<%s_Fgf~3He7C^7}3QKbX|jMwVO8+gkWQ5S!-8wSG?CC8Uy?&08n{St~A(R
zmu_wso6Y`%gN2NU_g&%}Sz&=-LP1JW2bANB5^c(fJ#ual8dwrydZ_3C2f`~s5D#L`
zzu{WP0qZ4}LSKeMKLtSumY06?n=jO(|57A7(a_PVBys=fahX<DJQlbm+S}B6VfV1X
zfaf1o-Mhy@Zz;057)G-X#g!gnY~*q<R6y>8UFPE>SCI5?`BRi5hMtXeNYKUj=PZiB
zE#FO3xKZYI;@$J0YzaKEA7=#Xe~DmnO8ZcK?_W4doe1l=A8Ffx`VU^`%#{f-ho9nN
zh}x)j9$yz;i=F?AK@#Veog__peeMXXTvoWqIbs<ofIgJfRH?Mvk6PiuLS<F4PZ5$L
z(9cpw3`+qf-#hiE3po&iGD%6(u=cGVR0SC|8YfnTN;TU#R4TRHOusU$RrqVK<!?(^
zceEGoe$ulM=~Oviq=3m6JG!8c4+82pKqSwd#;=$jQdYnJcTOGgj#u=qEAW|uoc9a@
z7W6{s{Yi+6GM3e+ki%8VUvp4DygM(tDQ+*KrjWw0e)Jkii&J-9lANb*Ilit<<8>XM
zocBvGMw2oUS(pO-Y^h{LF?soqR#lCRnA4d%w)ehK<#ut~_&13Ol>%aBg2l(E7v|@5
zoC0MiQ(~$6V)x=P>t8IuN3h@Da-~e!KK7BjuVQ`Ddrzl##e~eGNcGlYWm4q{MaCz8
z>c@Uq&UYPc38cZae2QdS%nS3rS|dLr75fsjvji{`;fv_wk@OKWlrrVLh3$<UI*|Kb
zVzBx_3g;P_tXcA1&_Gzi54%1W)YM<zeZ&7w&5n--W$N#72TrO3r)8q2)qLevlF;vT
zhQh~C)d@3lLGVKdBi>W9i~ba(msH8jcK0>a3mAMS*yfDf=KL%<;OmWS*1_y<bt39=
zqReIt{soO$h)EW?#!5{POvjvO?205~5^8Tgv)X*dZW=sAd4%Vq|3gE!?=5st3*R3a
z@aNb}D9<YdjwTg;$10Q$a1LNg1Ps}gRuukB-4-u<m9+noNF56J`>|<Kil$-wg0KGX
z&)`;R+F%o@{d1JkpH}t$CoXbOc-6BOnuLE@1#rP7;av1*37_W(j`Zx-YUtGYpX1a}
zy!(+zTHCq#UybJmoJ+cD_^|{oM9Aeb1L4}7LPxoovO!Z`BJAAwrgcXWPnWnxCja$B
zP<Y{Pt*Dz7fItHj(>4Z@c~a@33}rnyS8y(0rqJJIN$T+`nWjM^fw&hPBW>}|&hhPX
z7ey$GJdNTr*=UzpWu-4-pDEeGa?n>P!TBHN><hL|;wLoK)Mk`jwxNX^RvObJ1k#!B
zuhx(Rp9xRl_8(@5qjUc8U~gqxvJ%B~X6?U(&jnPmT+?su{ABxSTG24zs;A13h<*hl
z!rah}JR;CduE~3kc}R!Tj}rP-WrU;*3`cD}4rZCDl34CqO?`ZHHbnGxaA%fCe>B(;
zL27$C%b){_2hSiL9rP**xEx=b8sOhEv)_B{6lvUgc^7PRihUOylT%&Cxt;*~%*g1|
z{7p`0artkhWGE2k#La8)e3leq^t6Q$U(JM0=1G3LPXpbfHtm4CJC>ig``r71sdV@V
z-W)F&<6@|GT}}RTEIs%}u&$?TXxHEgH0N4*=)RiGjnlK$_1;lx3&-JIlxnIs!E3xs
zzwW)k3Bo5{hVSQv29Os6+Pf^c&afw{1McE|&Et>_?K4KuU)%=;TkBmJd<(Kk0T%-}
z{e#e}GqLT!k7(#L>MjT)CTGh^(vIq$OWQ{dr%NE34n;f+vF(r%?K9<G@j1?GmI1-K
zu+lTRp-)+|b)s|Kkq@zM;s_qVSa%gA_mZL&i1j!hX3PGZ=iX-UP`U;Tl+G|W1nGJw
z48O0*wr0eE`L#kK!LtYfgx**EMBkHn>hjh4zTMyFrZ5uVVSsW+&ZCzd$$vr<gx{$|
z<`)d?qqgLDgL<gmgukn9UEjCy#?&1b2tq+>-h~B!)YnSjx`hKdsl`e038+=OAA|v>
zVOwBDS=mJ%qK1vb%_rNBXs0O&UYDx#&I78zHI$Me_<L#PQ|bY+UARh2W$ME~%?T(a
zosfUH00#i)e}$9JyAz2DSfE0M^ok*NES#Uo$Ikd%1X6dqA%OdL{~g%zop^`tDqvhA
z2F5qf4k1~kfnD!HaKoT_leZkpFxPdt6QP$f(S4v+G6J`hC;wriBXj_+CiX$2P6b!d
zd2Iywz+8?FARwY@)rj)iRhfOgRj5qegysf*FnZVG?66>enJQ$id4K?94uCgBRrX_i
zec4}lAb=U4`<z(-(i)lHCA$n6p)`9YLu$1VCDs@s;QuGI-i}BK`r4^Pgk>31R!11G
z3;WF6<~VrOjqLhi+6Y5&mf0`Na-;_@dEt;%P+{U<EFZvPFy3TdB+1$V*CTduQ)fPC
z2|`Oy4a8LbM2O9^aoj(pXz?CVY0b<Ja8nFn`83Nr+yC5c<O~|o^*Z=Fusg)HLRb#b
zD)$Gt(7qLVGKAnSZguwnb6L0?8!4c`<L8cGKWWd39`4N#$6YQWaCn&~kA0%17TlHZ
zDFG--tk&^$F1Ykr%)a#o+v47QYcW^`(pHh-Hec^3PA&2tjMX7`jZF(>V=OS)lP*+M
zRCt|m(yOD^;oxI5vo~2QkZ$VUe&WB>&lYms|Ac2s=ob#~M$%lTh#(-Du3Ki;N+u!M
z4RI>cU2f5ImtanHz2vW*Tp2OK(+wyz1LQ9_-lpb~p*u0xT2L3twaZ~(CWqHbx{OmZ
zb>AXqLHgR^&Ji567=j`8PG~jFqF2g5A=td>TeY74frrQ&yTn-d7w%TQ2x;R>6oCBm
z?L}z<XjK@1M`*7{gh{sx&QzHAJ=fj`TEc`1_}$d&{pqN0BTw@J6Ao9<fn-5Xmm8n>
z6o$!x#_Q>TEAQ2dvk6|HTcR#<HXt#iY~$?s0=O2xzoE3NMB-${u!<pENlc5ow6vsa
z<DieR%D{NDdxJ|u{GSiVl49~Yu<K0zfh8U@c#Y)3u3viOQESiY4CP6KDKrx%0O)T5
zzbbP)pA!CMn6(^K^WoDmdN*PTp?-!GtxnP@7!xaH#4BJfS0;#X+bU#~WL4^EX36kL
z^vEu`F1__p@=k1BB!_9gt^NAwHabWE@9m|2%Q>ev!T<CPwuyg~N0Tgcb*u>yjMWP5
zGxGmdijc38k4`i)F?V2)(E5+tdNyxx5Ga&<`3v~e(N*7uxqCXJT25S)co`Q1R^Z<8
zLYSoy8;Gf9`o&*9^W{^=CB=eoXy{x##?tg97@xLirzSqYEyfx^9@7p@CMFnUeW#;J
zz$uL}4=Ob8R$J?Pa$1A5i%&?CTCc|UaRSx=@I;AK<~2S5!=cXkb8Ad6E}Famu7>l-
zo6vaRG~d>L4|%i8f=EI*wRJ*9w0=zoZZ%9JFVHx^co=*Fs~fm)m|u9=VMBxHMd&C(
zL3sqs5CE{Dyijlu<8XZLACvs{7|nz!1a;$KGNMVZ94|a$z#c3|uE*$MtSMdh<xoWR
zzI(E#ot3Yv67@PjCY;}^H?;ez<GZGco9MTsV_H69GSTiYZw#9YsLie>PNB-)q&H+Y
zM6ExMr(>jhwU=;#{9iIi#5i2hmJBc?DLH#H<S{pDh-ap~;lp;EV?m)43G!ea86n|m
zIv4ASZ|I#pC^hg#!>tiR$DQ3>9i16nVOe<Z^KsBr-qG_M52^8MJelp^HkK$YO9CJP
zEnsuvob(ZzFW1TY*0MVF_+Fp~D~zUlE3j54A!WwREF~NAd;dKnk9(yrJWf1_ZaCmh
z>(A9fgyOHNPSA20(v;QGP{lv@oq7AyKHJ6tkNsS{W6*VkK-4r*aqrQ~Y@Mpuwwjri
zov5~RWr*j@Sfb-f!oejU<qX+j=$g#vB*_Jn7uZ2C&#9qvMk0y<^65AmLI``ZP6jA!
zhOV}nT(-kmL<K_90$Ph6g8E^Merqnhppu5WE?nI+8guvHD##v(^8v?y-Kl;tOXp6u
zk6XvTR9jK8l4s+0gMlOxjGoo8ODOeOb@Hv`_Pzhjz_{1^LKdEl(nv+x7F=C;4_Lj-
zzp!FE^cH|Ctt=MNo7aZ*;{pc@I!?YLX^2{La0-aT=_0V&mBF-;SQf?fq2}hOpXlc^
zeqFkN6NmD2e;aG*b6nMm*Un$ET1dxwQ3ZrNXGoVYo0S~w3!$e#?q*;XA~gL#x95y%
z9-+Z%7pGq``mA;~$s|}bP}@;w2Vm-H_;BVJ*QNC_g<n`+xD5i!Fh~=V{LQNaAukWt
zDOfE-XYyi&o;`1l{Kl&NC9Di;xpN^5K#>pPr+Ry*2OiS*kQtrH_X9C9Bsd9X(NoO#
zBU^K!deABQVA<r)yz)1M6rgS)6&?7#koAnMa6j1eaR5<-)nAsl6=4$Za*^B+s$+Yf
zlrgaB=#SckkA>tgH!`iJzrS&Bl!v7+V?6uVRMQwiYU|<v<3{DJM`J?cWSlQ0Z)QE<
z4(gA%!?j{@aBz#e_6#v1mINsgC@<m7+5LxHdNma=m0WGcre_yXw2>QtsYH+?93q>e
zJFivpUe6E!3XZBEknQK8O0Vqp2O;`)<nnl}V|ax*`B<KHB~f)5<FYx$@chLY2DeQq
z5XJwilE_+nXTx|s!gJ^vA^vw{JOqD=Hn5-?okQv`=i2m9!IDGKg5OJ7-?ZvrEJ3$#
zYK%O|iTmhdz9SYo_K?d9=a8`UK@9E;7>~I5fKL`AsskWMWc{Yg{O2%L?HJjx`6qB4
z>nBHHDH7cgRj9^YHZnx%()_<YDCtegF|r6`8HFklP+IHRgX4fn;iT~+4K=T%l~9rA
zRKC9bnewxA)eCyggTujZ;%K~TMo$i4q|_d_HjZZ}U)g*{N>!b3lgr6lw-{#-IQ`5{
z+Pc|)v>f+eA>wD-higgJn86ad7XB~CV{B{<7xb8CR|)|M3r;tV@JOhMip+2-NF$YX
z`z)s!(?)<MT;Az<HIvIW<hizY;gbmc1d~r+JBQ@$kGoIMmbfj9>3g*w(Xpi|D?e;e
zdUTt67G?V}B4fpBMp+pmlVe+t@4rP0&-k8y{R&D3d`gfouQoB|J#_2ZOID>X!eua$
z(E*_1?Ah&tcsQzdlC5tWTQsFV8)~3}5Kw616E#|=2wB2^Lab0-w6yy=NMQJt3M&y+
z(%O>2<Shw)`^p4Zarf?^=AnEHkIv}b;=cG+xMSqS-nZ3AT(bINu;QOnU*h*l4@SYX
zv3HMSYb2$P;{>Ex>8d?$Cr;4!4Q~BaUhD$r;_ddH>2ls41X&Tj;yF}tZOu<U(%cXF
zRv9)Ub+F%)%8sVHpTQx>koi62xYu>oPF6v2r;Y<yQ#mW?@(}HN;aPqBaGH8-O8{TH
ziVXXp&etuHlXr`@Tg}9kbP!3-h|P2OtFBC4!ZVrN?XGwM3lDf354OYAg)Go_;6qq5
z&t&3W;fyC0Z1M)*p*hma>`wn9i8P>1evH@e<10w%w+D?lAToz!?UImjwa&Oh0PAF6
zj<qIMVENKx)S=!jRdZ6%mm#!5CtO5a*>m(;J@f-OgJacron0cq;~`2%@cu%Z(KqG3
zmuu1a37MsHJ|F11DsiSQKBu|9O7-Poxtqj0WocWq1b7rOb1@aCNTtRs^fM-&GVo$T
zkmUN?sB{+nfQktp84gGB8kg;3Ld&JXeJWqH(GJR<_~SrHw~=6}@gdsCJ(zE&<s-xn
z2tS}Ma3T$X&{K|j1RIwLH7poZYMB4PfU2oot|wKqKZkWe(aB_nHhQpc^BewuhvyKh
zHAn6d>?kZ-d}fWsK}zl@t8^mVEOTc>#HFEO>96F$|C}mfy~ULpcmKkr=%K+1Iw|lq
zg2T499#<2lxHAr#pN<zG!SBaeFAqmo%TfvC2e&p%FmLg*4TY&+GHuw$5WAQwV2scI
z3g%M}@p|F?KHnO-#TD&yIC~bm+sH4@t@qwlL#2gdaqcLN>q?^~x)(5P5_3s@auQG-
z?QASu!0dnlkScfimTB**2sWPk6$fj;pR24c2bvMg5Cm6;H!EGIgm=fJ1fRs-MBOVu
z%cDNLw0*8JGiR14E9em`8W?U7RYz+G<LM1iwZ3%OjBl-TkLNs(XiCyc3+t2)IX4N%
zb5dDhXnbxX$j<ES+8|i;kd?@udC5GIAoj)@91zM~CV=)aQ1G@97WwMOIlV;>iUl?w
z>F|7cv1*z0V(V+K=Nv_mW$#ZAlto4T>OL295>zYp5t_&EHp1cT1wS16xE#Q*OFxA_
z`T{Bp&Uuplt%2t^UBCT6%qUg;T#FF=7eY4e>Pj28X?2yo2-AFyR8`p(fP5MswfcRv
zGgqzidv(Xqo>XA|Rok6S<8JJYQb20M+?M+eTe5<YYACtQ;2Vw*4IDi($BwRngz8~X
zsU%J}KuNaNe$V<X>L|r-Ae~XEUC&zd<`+K&Lzw;LSYQio!04UUmSDcvkC>b`;v|>n
zJdf5ta<W8I_n8cstcVZ0d~Hbpvsp6z@!W^#R8p4`m;*ARMXGge!UHiDE6@kIgDdg<
z`rH;VJp=1%!8HB^ah*&J)YqsrCo<CSY=)_I51vvmP>;KtO$}s=j4Y8BdI1Adn@2b=
zzD}PP$EWS=|MR;O6?OhVsgb#CG?cc;Lw!BsK#WkB;kEySh}?ckqj!g*&QwxdWfNB0
zb1!SDA(*i!y+=H&Nw+cIxYJbmrWIbLHR~+(z+gB?W4cvk|AAV0i}}vD!lxH`Vl)3V
zwz05<<If-PHdqYa@nv?#n+?@5SDQz4i1BmU4YM@Kj#Gtfo(lXyD?c5OPyU+rOtJe$
zH05C>F3k;rmaZP1`MPUEu^@2Gm(gZX%Bmk%UsQ6E3=KzIxO_YpSH&n&H8`*yu2U@+
zk74b?Tb}WGof<_k4H&EqXFYIPvT-Lo^infbdK6?HC!kCJ5$59gnY-QZ)2kna+v1~N
z+$93@-HfXZ`s2Ngx@jL<wD@V|3Px}qjji&0Rbea7$>eMe{Z`KH-q6Y2Qh7f3`)WRa
zyDBl9joj5r@Jg2-lCJZ@&sa`^Na0W{MAaa*3CP*nx2Uy|khD>nIWX>g`};i(p6kv?
z+`H|5dE)y))2)){&WS2)MC~;(5XR&WW@9;t?DXA?n|d*@l)9fx5k8R*ABEWT#+`hB
zU?>K1)+V+}Y0%E&dmjg4%anZ><{Y`@q?_8irE&L+>hr6LDYI~{_nM{y{Kg8M+h_L9
zFWFD=xu$udX+Ns{*Y_$*I3v5^uf18tIF%+x1|cFITwFYt??Q6+yhG(^I~L0L+<LeX
zN?x+RkTsgiN9nJAIMJ0%p;d@mQ~sErN)B)K4UJP1QxW(|AD8kBuV*%s*p<)xM~D%}
zv#g#tjfaPFf~E1gSX_4RhvU~<k7D=8sifDe%LkQyK<FdhiX6(~_Paz+=RFS8ZY$T(
z+4XCih~@3%PUeL0xeW`Z+B9Mv(X;Cm=2v?Dn3tTN<S7Q0E_~9Ua8Db%H2l~)9q#!2
zlMoIfAM<015GtF)#3w!}Zy*MT+Uly6<gut=R)zH8mYc1PSCqO}5=TfkLs{Jjf=yC$
zV!<6#8DO$%KJ+RtCjH+ZW#W&&U%l0YzwN11(Ld|uiA3j@rV}ppem~tve3$;K@>R|c
zS4gbK`|lFa-QA?ONPDnVRhE|@*;$@Lx37_8#J!hpW-QyVlp}9Zw?2FKCyH8)=n_&n
z_(%rQ#9!~@_wvqAwdk_2Z6bm`M$~sPXA)iZ3|1Z7Kh&HLC(mD{$GVdHeyS>BY{Zs<
zIqR#4>bGnLPh(4CK4~5WzUlEA{`PbERSIVvBueCry#8$|lia(4nC7A0m!hi8Ms!2=
zB$Jr6-ur=fmW$1JpA$T$d*mUkdx~aFec}U_<zEuni@hevc_01bd-PU8yYLH3>M@@t
zO&1>cVH6vY*R)4j|FD?vp|0$iwDq&TF8|ldIc^hDU5s|Cx1Y{N3bGvsozqIgWX_%p
z=(;CMr4&QpQ$^^&o3fu4q}vv+`2Nx6kdE6c(eE0E@&{6E`P<F<hkrB5E?gmkJ+RCD
zmF|B<Xx@pmF6twZWT{P-+2)cHRuU=tc^_6C6C(W;9g<sCGq3PX>-f)3_p7VWrl(W6
z)B1hVC0{+R+~svYD5eMvoAW=He^_28Ta54C&e7D`FOphennh575^+;^zZQMA{HaU*
zlPJ$}XHHF|Av?Zo1nVGAk59SmaY3GfL&Lb0U1Q(c8s4+BQVDyWG9GMkXQ$ADI<2ok
z?9|%3Y-Z}pHk;Vin>*Y#&)PO`Aui)go#*WDACeEIAWROL{#)Me9w2V#sr?swQ95gP
zxBS?g02Q38nD6vbatFgrS<-t5#%%5ptnM;<sA$LQ|2NUc<&NV?KO&7sVHc(0C$HS=
zSyH@AX7T$Ex=|A+b*GK1)Sh?PnnxZhT~Z^Yyp#Txn?Cqg!MpM?xk6+fls3C~RdndQ
z53c9LF;`GcqtiOtApA0JXEfQgN~k8oj36MN)cy3%JXNc{$&Ner_iJgR)*qDqM%@1I
zItnN31GcBx!TCsF@-P#`Js!PnPIuADRPKD%G1k0UVax0$Hr<d*ku@NfG37_Pp&!(r
z)2F-FlCv+AEM@+*bM4)(Y1pHBqF<6Wk#!E6x*><4LjQ7IE0ZbnCW}WL70jVM<-;25
z0+VmNhlaUDhfF1f7V0d&1+mV5SYYP%)7zGBFcv@Zfv|xL^+RfxShbmcvyw*3mN_M6
zWhCiFG`0S5po+<z#a}{2C86BvIgJ#hf9{UMH6to?f?O5d3xd3l=kg#o6-B%HY==X|
zk(cB_isRb(m8$bFXPld1jbrt_`1>ycuXZ*LI%}^CQ*~8p87Z49Win!vMvDo87Vf8+
z15q}%Ruc>{3BC}qQK`syFZo+9Lb0Y#MH)|&@fh-K-&~(?Th5%umh?5CtI88$+(UdX
z0#`|SgrsWz`?H=1b@2CflBi%#B7QUGVUYNH21QNh;?viZW%b2a<0|Ilz7Ib(0OeF2
z`nBUJU!)_&Tik-zQYa5^;UaDZku--E-gJ7KaA5u{9l^Wv-)E=99F^POLcVn3&p!u8
zC8FETp<rpQ$kY7?Zyquly;kPCziuUiM(d%$)F~*d(s+xf#pd|)5#Pt9*lZT*9GD1Q
zCt98^mp6byeRd}Zd)shz4bL}<Xe4<)3XS_z<)Dv;Q|nco86XrE20iWEQ@cv-(kC%0
z$vTeKm5kI|D0bvKme+VR?tCww36hre5O4Yf!w8fYzz0oON>P$f$<a{;>S5D{T#-cI
z>@tjWmVIZV>s~DaauR!RVX--B@pZ~R+KS!fchV2+o7lLol&39}ZHVl(?x~M?4dhjz
zMubA$I2-Pd51GH*rYvhg0?m5%`C7b7sf!X!-@E99ep*{f;nZ>1K3wwa6Et`^;UFCf
zrh%DZtnbjr0$Hxl*vnUPt`cxN-hs1yFD76jQ4ovG6v2({xGRF)iA=(fDVlW;AZ;~u
z`(Mceic=c%$K6Y;?-##L)#}rDE4cDSsWC?`KG(}du-Cmw`<{!S+|St&$`n@{{_UfI
zhfq`|>BQZMU@BLvyb@^$wfJjYj+fiJ`nTW<`Kyu4jk=u-%CJqVZfssejTc3%qarez
z_PmRi$ap~0mh8qI4gaO#k&CO3SN=LB<zM_+<FWu;zf5GN+>`SAaJ%qmon&m+6SpN!
zX;?7eVK6>craCAang~PHD}F1S(T<ZGp&54D=h?ikHhkR@4=_uCmS-gd%>@r~OU{F$
zoI)@HC(PO3RO?5}oir`{I+;$FpVd13#p(&uzhhr~w4rnUIOheln}B8vgiPVe-0L^*
z8KON%n*v+>HEa?|RAEFwGP0y!RPLYADYe7Vs+SYe8n+VHOu0M4YzS;zulF<-4D38w
z{6WU$3|%h*O+=#%?R#6Ge09}1f6HNGAAx%ub5sFsAzYb20gQn-doVqJ3LW@Iw{sRq
z4^vzxIr&^y;sG9|LmL;T>F|J9e<Dpc^_AJ4bDAHHKHfyq_LBd2IQ^vCH!3X53v$?5
z`L&8w`p7}G3r~pz;kz!fo<TnuPgGlR!T#u+E0zXu;UEN0v{aWoj{m&(eZJZK`dN8j
zhP6S*JiP``QE=ob%Nfz4E7o7zHl}?e{hL=FH~yX3)9i&U+p`@B;80pdOmO(>1S&b7
zOHk;gr(l&J281C?G@zNeFw2~W$s1Jd26)er>~=;r{Pzw4ISi+xflYpv!<Rg@xB)-L
zl-W!lTI?6I;&G1Pc|ceQ-(!BxSIFI7Q#h6m!ac@g<~*-s^6nl5RiX>)p8P1J@YZ?k
zVla-)K$O*bq;~a151eO};pxq=oNr-UN>zkLg^X}3gF9GWRvzKRMUg_kT_g>{ai#}d
z{L^&Z2Wd{2Z>WOY&z2rxn9{F8E`rGQFO~2MKqMy24KEvs%cP&HLFO>}P|Vo57ax|6
z<h5olkN<}#BsOPo0J<bsPV={fVkq)0e@~gA6^}b=4PlvuIZRt+b$i1HXqXbTze?FA
z7VS=S*gE7!23eQ%OV_rH7oSrCWEM%#gHc9sqrzqf&!UTw11Yf6c{gy~X+gZ=Q&YmD
z^WzqS;AMys5DA0Bc&tX!{{{D6yr>u=`qj+Tvx-?PQ86r!z@<d;UL$dAU}lk-@I&){
zW0x;Qy-NlfcOjOau2Yt=fEJyUUTUWQbqSWnE)81>slRn5s9&1yGZ~A1RIx@O*?EUf
z2*n%zTx}B7h}f5^HaJF`VCHHJ)0A{Nguo&(9jh;4M2ep>i{&WGpBD!oowKY5(;j;2
zot2v09De@x7^^^gl)rbE+b{T{g1!8@{r$Wg@RyO^=u8xzp`K2YX=uwoyI{`ycWYIh
zbRRAl;ljR=#ibD#vv?<SvZ*q8_B6Caa?SWp>c=7|uvHmvh09^3minw#Wat~sHWYlr
zctaXA#RZo#9Jytzu9VsQ18he&h**2Sz3?6<EXC4DRSdpKazb;%1U$|KV43>fF|7oW
z%jCqZenl(&`&eCbj_#gaVXaBiX%vdVSeo1~K9uQwcB29>?@u@!55Z6c;g0^ZzU0oS
zO-D-Tl)Q1RnX%~_QP6ugjQYy*a2?xQydi&!nTX)p4!>R;naDS5509g7F}qDPfRS5S
zR9grVza6DE3h|&2L;08<+7?FnYt2NNfIrhqhpnHFhsuOZ*ih*_JpUJ%<A~Avhb8Jf
zw1;uW4RaspWikdt_#dBzJwVTe+}S!WG2m1g_y3!-fjybRuo#Cdp$<Y>GfjX0oo!{_
z@K|`Zlbs+b<a$}>8LMZD7eub`^`ge_KIp$Pn^A!nTn|68AYkc#?*hDA8_V4=(vo@r
zY^?@f1*V_{e+>SH<aJK-cO56sXxBI(R~6xcC=<SQ3B|UfXEyn+hLn$kce^Cp0jyLw
z(YagbXO6Y0e<%)MA6o1nZ5-;;y+f0Ow_T#+)K?+U$@fBm_d#l?^3y_mB(%`<Ms;iV
z#jS-`p@Gi4ed_Ng7~!k@qd$bqrydrH)x!Sm;Ye5+f(0?NdR|ssxPK9UIWI(`=p3;N
zz;+y$%pEsyzAz2FR9nUyVnK*V`Mr(*kXM{vg+&O>KKv(fU)7e9(sjYwg@IA(0>w;o
zAuk*_01ZIild@Yi-)exme=zzItAM}i|HZx+?K79>EMt{|z*~hvYTs|maNoZT7V0D?
z$Fgv<D2e;G{pXc_6sllL+=^?t`TT^EQOWYlHb*1!jzCG7Fz@f5bs})n_j8D{sBs3<
zO(>x*PF8Ec`&6ku8oV27;0zH;#~rIVc_$F{=zGK4-3s75YQ6U41jTmzv=x-MGvA#Z
zyc;~^%Z%m&mqo-`R*DW14z?A>A>?goUK_7f9-zyDqT(J)8w5iWG3SVByKH{r<3@Oo
z9tjub#&qY2@?#$iL=?{ww`R~0%6&JxYo-`Oc1PSYzaghRNgg!IR-+H3-hz5jz`EYI
zX=I*QutZ3&IG_^7)4w&StC!q7_lhvHMb(A<ON5U5&h*l0|F+%mB?}@QWoagd_fY&j
zSk_`PjP{xe!%lX@m=zZ8{5fn~cEf4c)CYGfLoS#V;PnX6hjqjM&c<1iGHiGhjKo(D
z40l=hlq;i$yoj<l{`=(XX@u`T!*CFKuDKqeaxw8>pes(7rFHt3yzOcQi&bIxae5nK
zjFT;-Uph8gED#qPz0BvEVqTRrJn}<G2;z$<<G5MN2WGt~7T}*XDb%SaGj<oqGPLz(
zX(r6S(0B^&7JVOtmkk#7ZXDo!5fzQ)_<JoC5W;l6|F`G!%bFimGem!6X<D9e+FvCf
zO1g_D{#GEW_ygj}*Te(tt@%0@JWsH6G8(Pb=6W?Oewr~!LBI2%kLc`_q)RE*WZB+a
zu^MZ_XPK?8!jIm_a$obMVH}oC>@R#*Z1yS0?-0BcQ=iUuZ5(3JH|I&dDF0Y|1nOlj
z&qg7CA<ORloL1<9LdN6MI*}~XRLC%M;hOx_HNl|5uSW=ckG-sK8|#PYUzSVagWVH~
z0R`2-+<3k8vvxr74?2QSym|ddVp&i;h$^b4u6ymrw~e|(BZ?PtE-*JgHF)0Ba82z_
zwwJbyuRaXSK$G9-t3T$iZk!9e<3z7%q8ihK(&Uz>l;t+<3eh2-D^vb^z@Cq0n!w&K
zH;LiDqEq!S{P-9e9uoePl}&mGy`*^#O~(2Zemp0|@12@bJ_f6Yh7Z!i>|A*<pOUuI
z5GsPVrMtgExvPSkR^?7mpV8q5Td<}8Oe!bLV(oK>^{~T#(n*=QzgE|VAC7sIz$dP+
z9S;u}m8iAVm!3|wbV0SJTQ9v7p@WeOT5zupc;x68PU(OPy@{n4sXtzX{TfKq^BxL3
zvEfS6z8kw{dCqVja?^saj~}?hHlJrjLBAm<S-w2b=tmt+#wsaYP6Ptpo;3sQPOlvn
zft@xMtTU=b3Rdm44K=V<x+mMvbeUg~KY_nTnH=rO3-cyR^so6mqprJ<e-tI;J(OLJ
zZYCl7sLH+=bWXY}GIWBHVH_kbG{|X1>)-ypd(dgXxDmE?UmkhS48*1;pb{V4ejMdG
z&88j6l(Kd&s*2&zLklBoV<KyyR7ASr@HJI2JixpC;o%MnDbG~(VjMdLx1qqf96O_Z
zfz3vTH-0comBY%&^ERst|C?RwS7zQ`puKe=z;qq^CW$G1rwnv1Ow-QgVN+KYg~Zn%
zNZ6(~^$Y$K%X__Z{%v`UPU5MBGL4_PdZ$?F_hwulOpx?r+sxjNzxC)n{Mi18?Y2Dn
zDN5Tao#U$T^0PxWmCiSnk4qcMOO8|8<H~{;IKHE9d7er;M%1_DqJ7Q39<B9H<<WuE
zJJG6S!pEm$@C`IaRZ1&*oUCpjVwq}X4M<2C!{Uqn+PJsw)fr?bP2kKcTctz4K5|k<
zxnwIJ%Z(lpC!$_|cFwytb;+OB8({XwfasjUYkI~^GQTROpvfdOq=y-%l#Jqeo}XTv
z5^|uf*A%orHvrI%XbbXnxq^&kK(9+OzkAf3g-~B>z-#}aILx=v)C--@!5=tfxiu7?
z-!B^6npF;JPbjmIR7vo|Yk731OksB2J**t@iexSUWxT%lwuG&OmK^pzEM>aOoC5@8
z4v)<p3C$eaSx;)%rbt&pd+#_@X29`{<9^l!1~DQR8++__%Az=Vc*1pBe+YnlQA@N8
z*`1HTJ8Q?3#;5y>+c6(=2jnUS10tx;>@tbi$%5o4g_%pA-NI%BDXl=<t{|nxo_0{E
z!dES0{8BTvyp)ftCdGv=DMyp+s7LjW^nD`&2Cv?lI);!7K7NJ;)8vdp%G#haq-9m!
z?Y{dl>@HgCu=6OZX7&Yc&C#7CUh%u$F8HYbZJ3~hI}Xc}!d5@J`@0ggkk2T-H0H{4
zXg^yaBrb{;M&dDfln61Wz$ni$G7*g9$iUnL#oKp}2C`t=zea;tL~$(fYKRX5M&(M}
zUZH^Vy^f_Nlt}g$yHhWuk5Y7iJ3)OPn6^Vy2SzSzt-j4Gh=$<a6A+q*XG*0KbFAw{
zma-Ugio6ULeu=$ke70Shv(`8WJwYi#3N6`5$Z%{U_9QeVX@TP{#Hj((mh@9J4&KF{
zzlok!=FA*&T4%p|iom}NHfc@z3J2g4)!CDhDQ}t}9KrAO<QbQqAZ18kgv4W7Zx6Vb
z&T@_3sgqYOEn!{|Nvn_*xA9fSQKNQXq+ZR6Gy<nIf-%ZN(E#$v{gK&j`fuRSY6YYS
z0fo*PWIPR=Pv8+IF#%g3D1Z}Nf;@2oVU>tS8>-c2Q+mM(I1{js*34Hh2?SH*c#>Eo
zf5WSDyr6;2u{&$iw}h<_z>83NK|X}a8d5O;IA;%Q(EltrWZp*R#Z|j&42lIHs9Rcl
z0rV3hFl(>(f#D~stP#iw?st&ad;dv7Z>Clf%aTj_?S?y{b&`(9);P88|IO;Xp8&>p
z9e@s_=~ZjB+3X6|*Y=Ak{^NTVkTO^%2!(h1Q_7Y|7AGEJ5VG_b`~b%^f&d^MsQ(*(
z^<S9Pw?zr*MVGE!IA|F!oE4iCOX`!rAmQh#bTz%^I#G!TGxW*h8;v=j$b-BOV8xQ-
zWC1b>CJ61Pyk)IHwM1yaCHuTQ<_Y;S9y2J5K0N(9B5=|54lO!)3X&yXrue-KHiNBh
zs$6lNJP2AFQBK4nsV>6PsY2#=F|n+ABJY=a^?;G}HPL%d3a$K^x>}gnI+5tYh_aG5
zeyiT(rNjlBAx*5zkQ(&5Irc+qFnPqC0-V~_ZFXb4SV8#&%>|iQ1=rxX(1D}kLA0<&
zz_UkP=Us&HI}2fH#nq*p`e_(bWAwZFSCE^upya<b);$!pZFZxQQ+GeEs8B+hb{3NW
zU_v6mRV-w~z^a4eaEpIAWw>-4+x}NHkqC}{FU^%-YO!xlZp}|_93!{aw9wR)&6x*d
z;u(z8sjBwN$FiSPf|N}hTU>{qE&I5xpeCvH4ZwjgD3wfi?^F(@UG_6aA;5<?w@UzH
zM2JKk<xy@;MBTk9!_}8P{)|KX^!_gTb>WMCONp-ymqx|A?LpoQtt3Ds1(ZjVY*aau
zNcdfcxYLkPmT(z@(U|?I`YA~m?Xz(xxgzdjNYgP6jhG)W!-^$V{dr&#heYzj&#meY
zANe4WIJ>x^rHDDP50NinH756L>oiC#zUX*h0Ly|zSfa^d7uGQAN}dEwh^Dp6jdnuD
z&wz2nBZzEk4$u;_1%WYWi*(m4Fr<3yg+SLU7JFFs%JugyfM5`}y4wr>WbBhgUVIip
zkewWMXYwq^?)BbTB(lT(H$6aIt!))PpMd=3X8K^qifirq_{r1JZ5BT0eFMvASie)E
z{E-|q=X*Ik8S~*0*i@a4CYi4+k)}N~c?J=-S!3Y)wjH-Po<sAu+;-0y?Q<+YF!d)m
zA%D+8y=s$2bfXP!+gE}{Twr7C7U#M4AMdes_lQ>ii59+pLbO~;Sg<i4@%b=TF~dvI
zeMVXG*dHFAkf<|kw~|cdXL7N+3mPvlL^7Wktb^?wNE&R{_!wx5oF!vA_|-oEsGRBg
znxu;AMEh37`mk&+y1yQ{8^?aA%fUG#W)=WUj?@y;itYjVSQbQ>?SH0OTP`B%W-^=j
zAxi&H`FCNot#4j?M+Q@Qqh}c4R}tiil6dyQSgO|H*)X$DPKrY=AIMzw@_!rs55@2%
zkR6G>M|XEnVtcna0wuF#RF*IAdh^MgR-TVqF`HyogG2D;m?5-4vMG#tBPm+^=HTr+
z4s@;56<I*jwb0wbILPXB^3NkxYtGj>N0U&(R$>L*V`8kaqqe=)9BqtA`0h=a%IG4L
zLRu=hL>rGkA$~gRU4@%Y2~zF^1e{0Kk<qiON`ZW%Cs4V<!!$)@T@0K&kK?_mJxP89
z>`!Hq*@n=1*EfK9`hvD9LmrJBmQN5ZvmspMd<~esYp>J%KtAYTH2MQ-=`1enc%(MY
zD)jsOf0zl+I-B=m5w4R#>A55RcjJ6Qc!1Y#XKf+OX77&dC9VgW5fE8qh@q4mddRSA
z;BdTS!@Jvt;(6NG7$;+7#Wa@qgoA0lLQt+{@lNUOcJ^hF^nHF<7of=Ugp{3Ebf`#3
zYejvbL6xOx{fF;~0|$`<*rPYIDuQF$IY%rfE>>PCJ#8%n$Vf3<metGkQa#v*$-qoi
z9?1@`;sYI~jnIgM-%-JWkn1qLdB2u~_bm}vNI3UdxgO(P1MA9#>+n>P{zR8A3x)^{
z(%m{u(M=#|P!n;sttcZ0fnXr*r>d{?%6G=01i=0lO*t)3tLJ%Z&HEIUgS*ID`Beto
za1h{&@r|=&8>w&){+Ro?Za=A+q5vEYItFLaeal9&44(wYzHI{5U4Hr0o--knL#pw(
zHr5M79yV#y-rwyjtvH;O0#k**Ng)l-L!-?uXZN7AH$da@@X7gosI^53ST({pf>aY4
zh{2Mcz{+<c(YCBu!xDm)0sk&~&QRZJ8Eu^O*ee0b;<D#VogPI;O5V&scA3CCqyAf=
z&7aMeg{HCbtqxLSMC&y-HBQ#Djy6^=V^XKzd-C2x0c0vGyJhV6I-O7GYWbINeFINA
zY!I+<0em9C1QQP%h{@ZP_>Zr#+J5Biw0<}g?^tZ;6=lv?D_#+&gG_E-uxlX}|3tz&
z`q#$*DA#@ou@)90(Qx2ty+AMyRvt#FoB3>w0M`^PbiVH#*KT%g5FX>EnIF&9-hvMf
zo}1Bl%mCl31Q;;7_wK`?HtZ1oV2F&=Qvo@MWa1_taB|`9P5t<2OcHKOt&Y$G0?@*$
zf7O-tN8%pgz4px7od8{cW$41K?T&k9p9u<Zx~Hgrkh;$ts`a(?Y6$ScXLaJ;Ym{o}
zWh!$+6d*^h&2}^e8u)8?tGmTqrC13!#@YJlykna~UZ^m;Z>X@(w-UyKU1BF!yAeLz
zE4Sw)w?|nVuKkScT35i9XivF!Pn%p!L(;51$zz?^ms?1G2h)#{JY8ArQ0e?qFFbif
zjD2G`?&V<_AzP1ElvHkARw2KutT{UK74!~oQ#B*zCN$Pqf8d|TdrNk*z=d;Xb*57S
zGv33=iMfJWkjs9^jlG+q+3Nl{l(_YQC72sDL0wRj>QB<JBb<@pr18vKUGR|s{TqK}
zpDG@+>pH@(Y6Fc)GMmB&G~SDQrrWR^v-TnCVVDS#QZz9;<@|jjX7>q$+4#(|=eWyG
z^`IIQP@#ZiB1j&1Z=%PevX^?7A0!L-6}+6h;?w6zdeJxAHooT0`(1^ps}@{#h;}*8
z+CBA%Lp4zm-iFtZG~EkU&Qq@jN1(ktbq6gJ`kqCweS;s|i((AgP7vG*42N5O9wu24
zm5ES=X)Jrvmd+v7)@1<U1JQgbJG8_g_%EOH>x84U(1J$L103227T8EXmip(qkoeMX
z+codOr$ITyevsB_4eWWbjoCY4(*BpsdWI^_ZPR_qE>4misnN_)QHET$gxUDx5-;8s
zLLsfVc^nU#P<jO_G_4<6N7@JF@X!!Ze$+kd1ZaLZ_XMw^O0q-2tOM!wnTy0=ODQwV
zE;GhkJm_<xwFNlOK3wiFmWg%gxH-a#)vj_g`Pb-UHko2bok{v(U>-U9EnZXmN?<TC
zU76|#=bag4$cu+YN~57cyO8md4B~fxU#@~m(RXQxc%5corfr#7D>G-aMLf%|2m`+z
zF><Hu1?@lKl<(=lXw&Po-*qFb=nD9YT=CmXy2K>jhT{TCe6l?Je%QO5`6U{v-Uj}e
zEZ%v-CxhG_iD0_zH~z8d=Wj0D?fFX}K4@umk0t!m0>{-ZHJ?i-n?k&FW-+4yLb<hP
z4_i_hZ~kWXXqRLt7d>d+M}mR4?>VNV^Wf3WrBB37x`Vs8I~<$dG)Da~C4*^{?#ajP
zr_GjM(TOC}cxMNHgtC%je$AD_HG}hP-}9t3Wt2if!D{u~znZ~}4xih!KAn~mIf&s4
z|HtR^Z3y>;@H_3pB2K;;^yfXTCY8aL$9Cd67M$FJ%FocO<XrT=wfnM5vzD<<dJ{yY
zfo~xy*^{7)mtyja@5V#dlhG1COO2O)&#Yh7KHVzQn!F(@fzJ`h={<2*E|;LlOcfb9
zJU^B}<=f76sN9yE!G25^Uw}Ua+icg3_8)|U@vqL9i=29J$cSHOg$=0C1bL>>y<l1V
zWo!DyKPOscS<|B?MPN`d=kUO5m|uW)gmxt6Q)?O*dqHh|T)*<aF%$Z@7QBi=29EJ)
z7G`5}{%4Y$Z~Xs{qO%NZ>g@yYf=+3XMmnXD29Xd15v9A44rv%uQczMl1p$?Aq$cvG
zQ;>!s-8Fh+d(ZocFWYsU?QG|;=XXE%-QGV)J5fRVj8kcMu*1I3jjzGD>y>&#;isfK
z@`0JFFMNiDg9B7C@*cvb+;IT|3^QKZ+}wMHwgq<In@pYH->h}@`ve<BpN0DJzb<;A
z7M}bmVfUxHq|#YL)@f}*(t_$d-*euxBT}yDxeYn(ZNiT)%x82)YA&a&n@%KB@M>>W
zx7cFwB?VKP#p#W%1joGeS#2_db8+aiI3$l-!Gv;hQP+T`g`Cj0lknT%>*PMkd!KCt
zq-<dtbbPc1Y!3aU7b6ZYRT^Yu2oc;3i!(Dlm0&adLYd3~x>i_xRtniu?mkBf&-5L1
z%m~dpuYWq!zw9Nax5unUA9|Ec)7U`Y>Plp7W!K=*mlKrABP>?bgJzsHgbxgMQ>4a>
zO#+ER+$%(1|B8@SCxd(yfAa*xR$Td<e6YDIh?*Gn<~_-R=as<*8eM<<D5VNNyW{C=
z)$J;T$2sYSb(ra|ychjVn=)h-2;pN-AhoehJCys%<e?~2+j>XDBj-tlptP!cad0sI
zR6o42y{Ytu!->{OHTbDfTYJ6BFY&ZZ1<J3AS#+0gF1wr><eE9D!a@s+BcG;I*#}?z
zNfU1~Rjhht;e!r{<8~>hS2KHPc`&gj22K9->77vj@z@OGlq+crZ^7rUM(%p;4X-)q
zsQ!|$28kFFY&{Ptt{6+y)cWuICb*D`s&GugRhkgN`J5a_ZK`pU5aY7AYo}!L@r=|>
zLcE@ly`hw26@9k#=2X`|ES~sdyHNl1ufQR^Gny~q5*)dMsTZ%VRo1o@-zIk|6gT@A
zBp5D#qI&n@4NE|Vx_fQrcx)!$tw)Hd%gdcg!ktoY392`rD_(#i?AMj`A(`wo<DWgm
zPj|ydGjjg4#L}o1Q*bdN2lW=Ug;pd2hwHUExwRGTUvbU#u<I}@b=UDF?!<EnoDFqE
zp1FN|m{E0xD#-e7ILz-q3{)4><g8cH+%<T9JqM{f?oElasGwGKOBR3oC{>>Ih|nDp
z@BZZ=SKoQAC2p9o=M?y~xa(2v<9}s-3NMOh+v*zmI;c_Bl@L4FO2Gb0(^@Mk-alG3
zHBQn49d#7Nxs7)mho*cMcAWVGg;}qfaDD<-G*KUa&0g~Tu7i6zV}5YgX=kVJN%YZ1
zB;{@Es&7BrP=6p_;XIxE-yBPNb2zK`y-DL5M0orBHzF<SdFsmMAv+`Gn!AU#i0K^%
zl1ZA`;-^-W6?D*M??Z5VGaLK9=}yU}OsMn6_i|jiZR0aiF|Tfr9UYLSX)PXwg5AJ!
zvd7aGr`Nfb+~0Kb(wP+#ojLXdKEt#(u3ob1L?4m}ojg5IHrQ(FHXMEbWy4{LFY~gs
zyXeZHKnqPXtQ^0AQQO^^?kKI(B`c*b$@S543{!#n%3FTIeZDbl+sIOR#e-RZl>GNk
z?}bVFv$nhK>R|YbX+Ih*iGPRcr4#Gc7iYJW8`FBRZW%Zx`p_6=uHj&dKgP2WLo})P
zITHVI1dm3>zNIVUNsgkLjmkxT8S@>1R?tk8f@YqFN^+gKiy4^ETBeY+NX1Qc9Dko9
z_Z(_o&==Axwe(d@%<bngpI)=2=BIb`B`?Th`*LSt`u(o^Dn9qC#+<{1zYX7^Wj^^!
zauKvN+J*5-ML}yFYhwG!UgK>*L)i}xqGIlr;|A4I&!WY;R}U^4%LLUXoka$3K4-N9
zGLww-@Bl`$9#UUi0g>SwrjeY3SeFl~BiAzQ%#IpURss+i;RA_1;^@C&DXEY)89UNh
zV<@>y#L_>yJx<0d;NM~AML*X%jz6dCk*~sq6LvP~ymjP=<%1N2seG!I85tO@8}Z5&
z>rlt>)ph+41LMm=>iQ+ti4OxE^uDHN!*(*A%J$@$-;XM1^v`_P9%fSL4*th_)qnKT
zt#P~e@{C9xK2SMtI(0%WausK0QL_7N1;03e^Nr-cmRy%c(}z4_`|b}eql272KWW2%
z8)-Lzp7b!)h~8m`f&%IT0Zx0`G%7HOF4}oCBxhqfK4}6j7T8-EN4fg)Zw8)r?;|_~
zvs^6NL3L5tVQD2bFNWCbz3~hQ(-Nxgd9Tkf4K4rs%N1g#2_y1GIa}*fo?1<|(MA~G
z<z#DMZhN=PQsL~**v%Ql$ah$sG5FPeCX4Vz7vJ%I3St*Cfs~xuU_<gFR@|GS!O$i9
zd=fbuAx7{I2M2{7X*wr@>fme0wCvr3cyOKf!W*+i1VeST=TDRu)e8&8Cid#aoLS~O
z!|H{3NY0IVI=*JMR*gJH92_H2`w#|o(VDW#VR&M=T2}<}fR4&#Cb~mt*1AeJ#*@dw
z*|Sp>L)bkcycLU`cj3;#<|SVbOpMsP9*ZAC_n8|nOh&{Awdp_(6<)){Drt&!?bD?*
zziK?c@%zncV*gXwHoV;MOm$AUNuC-vPl^kL=XE%+Q`|nNB_-RY%bHJ1Kg%3)zxcV1
zW0B^1gL?hl_QMiPXD2@mX=xCd^Z4}*{>`?;3FDj(tZTerf~Ts6@$Ow7Ql$RZ#TC;L
zhG(SQboOv)O6um#C)nluZ!^N#-b%2W#ZnO?>Q$Jm;D2(#mQsnw53Vo=^@~EZPd<iq
z9Q!>U<}JZVU$yF!{1U>vmSLwM`sz^*qh31=+k(WXq{lM85G?yI3@7!y@-id(^{Nn>
zmUMbP<5mV%p?y467e-sQlvX#=EPD|c9?vHJ(kr`Yv;7mVG&~6*#6)%LfTwHE+HfbR
z9x;zMx+_Ol|Dh}9qzsyi=L+nu@!?MwwgGBNn~FR#V&Tru%Z_}@ds4|?9)ffb)ZX~_
z(=-U|<xm9n;AN{u)f`J3Vv|*Z_{eu#2`hMQUw+kL^*xih*9wwf&(deI{hL@F_rXzQ
z<_OKRukifxFEb^K;Tkf$AxVxXjHHwAuKHjKEpt6p`TLCk@oh{L!Cbp~n`Kx{Z2IpO
z{(@*Gb?&!Vz(<|eN`gC=05+0k_jY)tgMFdjy_RuF8g~f(fzbT)$dtu`W{wg0_31_i
zCBnEI)a;lU{@xFC4>@}fOfVgceGPD6wM1bXw;z00D4T|5tdB`;jTg=XAsrwgzgOP@
zjFnsYC6CTkoN5!elsJ>C3SxMp@LHp0=s-K6w$3XZ*TS#t?5a3={+;k(3gn&wllcgn
z*?FOxqE{!0a`Hzi$~_{*dps^KC+Vh=|52DxRM|#y<_k?n|5rPm0NjxR+dPA9Cff%&
zC|ybkj?m`UqcpcALw}7A!A=@bVl}%i93f|aDZjC$2S`}p<b^`93Rw`(Rri~;9VUdr
zqyA0(2eEW#pKDE2wiyz!?RX83|KsTv$Dgd{cZz>Lw;av^GCY7v!g4^vQT}6>m7wuR
zBz!giEIiYUmi6+&f3DLf<q~Q{9Cq2$tZm{o^hni=<j>%@T@4v!U`9!LO_~*Wkp0N)
z!GNHOen)j&3SixTSDU>UfQ7iB-@CW(da&!lS*lBJXo4Ik9Qm0-HRkdheO|F^f6MwJ
z-v3d$Fwaxsc5$`&vr-QIcU0*x!hTg|_2VC;bHYy1`AdB27qX#$*mutDS3i3E9Wr_1
zb4V0^bNv_M^8!8sNs8<zV9Krk1=Bi9b3%kUIWYf-z+(l&fd0>qV*Ljk)DLO_(Q^-Y
zq#sXDcXa0SV^L(kq01Lp9$Xbw@Rj}|QJVL&;6%-dz0Y*9`NwU#*kBKmj{i2Ulc%X(
zCklHs3?p$&K~9KbpiBog<^Mnqci6~zXO1H?@z+dv{Lt>mK4Uz{e9J$r2Ss=?#vDga
z_?gggX_0|nt2fjXx&ld}m0pTB>prIraNK<URvJI}inw;C`tc^gm{{{~(}4#cB=*4M
zHYtdFuXBtN+Kq7DvhYQ;%e^U37r=lh|2)_4Gu^c{RbxU~RdpRM-370D0r_i#8<^y{
zvYC0)NfUt(3IhNzkL}(MgP7)p=55@)>V^eu3#}iVo2mI3141W7i3+X!mKzFSQ!rkJ
z1)-j^(IidH-^^NJ$3H&!L10XA@`cSSKxP)9@}TEoHvT-JQrc}&`(?1yfX36^7j<1k
zNlV?tBQ!GdnksH`RWD|PIPbPX!||=4Ksc!VQmkp?rk<;#QnT0f*m0Hkq|j<kzvXX#
z?}XWvZedF+5Fkd+$wB7QQ(zmsd_r;Y#0ocL*H!x%Mw%1+d8O^mw(+;0tM19%0xLNC
zjV3dsb_9qIp<}mW)WMR@Dl}GBQb-QR2-%%>#yj8bmj9q~Jx<H?Md#Jv<BCq_x0(9y
z$IL#nyrg>!OTg@hRq;xym}__&GR&~fpt0=;&J$XIhWL&?s8_(HbNzD|OyWsNbN;62
z^tRAAZ9MRn>1$b@I791`rQ3dl`6kS*>(oVNSTSem@dp^qWm^Lw2RAM#xF`r^UU_ti
zAM|8yfzon&oXf+JqcFSc!S3-2PIR^E6vow1E$8Pv*piiPT@H(Y7VL<ytm9t1APC<}
z$|#bpF~G82ox#*3=Ejwh6Ay?&D47tN^O|jrm_l*Xt4-(v;{^3FivI-Jc{`QQm25=c
zKFnj7HinKM1&kmNQj%QvGjlqL)Ou;Vi^k^w!-f|`)^V&3TJ`++fZtV@hIs23qMD0?
zZ2gp0eOkF5(&JAKu3h1FxhzCjcP`#H>~a)1%6v=O*s2$hadg~J?dTb*OZ$e%om@%8
z9+<(x9O~Xbh%Bq%@UyN|buw1zW`p`!E~-e-`;5%LiC>=I6!_IrirojEmO+zL`BV5)
zf?wVi@z1-~Kqe<0C%pqU8q3Uc^uXMq=xX?}#<tvO(9q9#N1?zrs=1pnTrqm~<^Xl|
z+|J(UyQOD%+7|#MqIxuof9sL*X=ZGcY&NSoM7RjKZ;3IanWl4#St4vUb+}QZ#sbKU
znUhHxa#rW#&d)Gz7mOF6ND5=Xwal^CNa$}}OPd4+lVivgAS;&z-69#&LzuVK;Xct`
z&f&1*O<Xuu5!E|F>?P)WNfo4e>})_1?Qv^vu+Te(VP$9)O2xt;UhU1cW$&^)4|Q3Y
z<}Fg*WG`yr_JC*BM^1WKPYefUJ@lKo@;T-~WYE5)xw$KI@3AJZI>jtw3JqXi?R<0-
zTA7!rPd?2EI;%X#HD1?#R64%*#Nmm`K2G5UZu&p+5p}AAx&ah%5U~P50oYG759!=7
zP5sq#W$Iy>^};sgAw)T4rdq`}r-a33<qc~PVFy@=mB<Z<v4LNPJScyhlTvqbK^z2C
zAatUP`dk5Spx(9s#eC&y#}@*#Mam#@p+M|O`@`i+R-Z7^Ih(y(;geGplf;^ic=Bp{
zR)>|vrC6DzuVILreS~~jmMD#6N759BJf0$qFl@(?xImmW;38@&uj<Ht1N~a%IFmvF
zSbBVSGcvr?ucY!P0-qi(gC~pxQc=W0Cq#w8(R5pPmj8C4fmAjAig}ONHo~S!nfJ#X
z@=*+{Qh=o?h@G`JCtWM!u_<!==SQ88c66cQJ?dhL5%K>pjIBM|IQmef86x6Gi=kLD
zlT_MM$7%^tUpA}&46|$-M8bs7Dbm==|L5)tuiJ;55Cq|mun>K#qwK)XK|Jt|#u$I)
z_7n?(fjgQubY^eNCH<`<8IP);Yo&18&<A;Bmss6)JEWc{nAYl_0FlY)hbpsI@IU~z
z%dC3E_YMdMUs<>!M{%S=k6;j|R@uP&d=i0x6bQI9lM<fccusOY90|<+VmLcf9!DZ~
z!@IyEf1VetyQA-Q_gs5roaJ5=AWw0G)t`6l8M)VuN5D0d^1z$*%w$v2Ie!#v0kWde
zx4S~1faIKU<nYp)tcg@v!Xv2@y_^31+Z?dXhurtEvdgPWNv~(#P^_||&%sk+UHU)2
z1jO^#Eo{mMhO1R;XKgvaR6*KCpZ@D)1(#KX3;II>T-9ek;U6AtqxF3Sb1vdNYlEGR
z<F=pJyvxP!n|nRS#W(+9MuZYWh8h&>Qm&xO1k{C{!-4y^b5?1tVxR!n(G`=X!m*zp
zij3~j{a+rn!EXCID1G0C7@ab+Su8<r=BIy!hD#Vl4aNmW<esxz*W)z|O<06hS-s`G
zv*=O5*oc@&ZO@SVPHdMVqQ|JZb5UqX7>+pNsipr3piz|=+m1C!PW*P?C~~u87uny$
zUM|a&Q~Ufsy0N58q*%m%vVo~bCpR_KsLi--sFL!-VMjBri1Oe=7i@MB*xR^Or{+B9
z&T#8azoEE<+xwk$5<Wgzn{<R)U#R7JR6dUHhJ_<zl_*-qiMh##`qp^!Dz?q6Li?@-
zFHG^lqWEe~2BYcNpHj-Ek*&6|{{1_Oq^1)R{E9J+sgh4C|7rYXoj~2Ct?tfCnTbml
zIXAn7XKKR2#u^#2wB9@-d8U7|c!;_p&G{hZd0cpM^Fn;GOA=dfenWz*n;&@Kyr%!G
zJHqIRDrFdsh`o6QwTM#6ce$eLPn)pk=@JsW+fVe-KmYbjUYEx-xsSL|ln56h?i$2-
z8=7}0)DZpk*XH)1c0sH=OO-G2ua;$)3cW~b^uvh%;!RqIu0&r%xE;_aPL^|V7M;QV
zph<7g(o$2hH^9=**dS-zve)e$iMhcCU#wRU?uE&mZ#<Hx+p$bvD8)kgF)n=*%h6Vp
zb_OFNt9?jL-Ftkor(e_gWV`gcfwF4E@mWi_0=;YfG4@W)3ekOIZ{IZ&i1HBiQZ5KX
z-aM%@81=dfuXS>UQX(;R8fjh;!B_lRNl0ze_I=EZ?cL}(s?_q!Ti?l<q_^_D$Uv;h
z4o(rnpjCGmy2F-=MFwM&x0AY$F~yZ6Sk_y@Gt5;Uk^wq@YeXFq-1z+%bGHifh2h)^
zyFpPzKKkF!7kPcC!;fm19X>6@kTxg1JyR4>dyqb=^p^3XBKG;vL1shWAi;jvBCl8~
z*qpw~m}?8|m5(PG9yy@kbZ3@$)}r@iki=b@$)R60VezI#&o?5eljQxowvKW5P^W$2
z@%CEdhoCuW@=%34ol+$GulUQGlOt`(`2OEl!&?+g6$&eZ;#fwTaV*d_@>9EA{FhJm
zRPEh547rUR;t07W=WIO4;~EeiMw>@<_V2E(|5f2TGq(9x8sK|NxTGkfHh~HkKOWQG
zfW3(7pI?>Zg$C;RVm}boNZlOQnZY`E*A0URT2*VcZws|x<jF8}wske?-*VcCel}$$
zN>~letI*vaame8`_^=B%qkZiaiCG*cD(Arjc5pXG?5Y;U-NO<7Qm|xqIWL}_qs-&k
zx#ownE^ut^Q=e=;lCFeEAcCt=N9AY(`%j_RT9VlCzP1l<H7=2VO`0tnat<a?l)iyj
zR}ya(0{^Eyl!zf)2eLLDH0SK8d0{IPL5}=SjaBeEg3LSS@sL3go@C@}PES#=o8}bk
z16`lk%&z^+F8yP>w~y{fm(XDu=akvji+CBqFwJ>DoI*B9)t_C0{O3|FXc(rbmUs)-
z-g41&Rq<CEJ4OC-s?ZgJON^u#KW1*}`S)e>p~HYZe1lA|k^4bpI2N1NWX#srQ{ILR
z&)J`K%M-DN&?kYaM8$r|56anUf2b@pb&`DssDog7ervqN5S-e^PRTVavK=*EKepr3
z7?_PpE=*d*F^W{PeOBLRrCMi5s`Amjhh9r}h&P9XcKX^xBbDC2;+PUj10t>gx7$;b
z+l8DNzi0(Jo(C|tU0FYVhf_QX54`=M59(f)Jmx*a6&FUFA;}n27``cIz@OTj65R+>
z1^pVc{&HdBoPwR2l;Y&=-sV?xjJifn5=H!r8zN;g*t<;xWdgGmCde&oJ@Kvt)9V5-
zs<Ya?FjC2S>o%N;)(<5-4lhS6{&tbYw}1kz1kw{W_0SGz0nPaxi19wY=e<_17}m&O
z_Ck2e($MX<D~ch6=_-M0uD;JHeI(f&ff8sO&VV?8zug06fjy~5dg<B-P0DXcMj$nj
znU=3_s~6kVC$3%P>bMD1nGXI0hp@W@;sbC9kIN0$@A5d-C^y4!a7P1GoyZZe!|&`)
z-;l7hvi7Iz&@94@DP{MTjOOtU$?UI82Zz5YKt=Jp+d$74j5r9i^sw2(RDs>iFv62%
z%#w{Q3tSDBAyN|8qLj8vxe&Hrfc_j&d6dEgj0we=-i9e1{8X??#*VI_+G9nv6JDfL
z$vXi!Z(GSt|B+X+C8yLva`3wtQ8t(NvG5;2-G=Aw06QpPN1m;>?xf%9Ha4p2j<I#<
z%mr>KUfqupK8ca2C!D}ZdOyE+JS@(BL{BDr_pdLVQWl3-*sXn%9-+a$sDL9a=$;J4
z`r%o2s~?ai{f&Q4aShNiMLIt~dBD-O#LZizx)RE8ozVvc+hk^-6Cs)KTmVF1Zf|^6
z*#NW4T+^}Xl1Z<m9^A@6#1Ph-y5k4^rW%Lt9D@y~kuv;h9BLLtWJd-+WUFrL_E|I{
za}oN8@GoL?jxQy+zyFbCl2NW8=Z7-HF9haFpx_Ra4}boh37r>)8sb%>fv8yGCIflC
z)BYrgCPBw?oDwVphe5g_vqS|p@2$}8W)-Po+ph?Q07OX%U<OprzrLW?IC{hv=rjML
z8t61Ku6A|ck`onp)m!gpG_5X6yQjCFQo?^&JV^5a{dZHm9k+Av7Cw_1NveL>{#>${
z`gM&C#|X$Ubz!d=YzP2LtlpXO63-w-lH_^30_h2f^gkVfh+4^(cs*~vVWxedbF`8Q
zH9CZ!@uY7Ezf8_2I`Alc_S?6*x>cP*05`R%yg5M$h_~Iw15L|kKids`@LCCgkQQ~g
z$Mii87Y$Je4cU*abezrxt`MGpSHeGadqDkSP4edKZ6-MiV6>pC#w*rrS6q?LNzm$_
z1(R`p7zy2_>%#agX?A(#G3u8dS!WGJ(o3d=iwNaNYdDHIW0WsCjm*vBYI#_MWrOPN
zDaQ%)26>t-tS!syJrGd?bl=>M^!?d|@z)oQ#zD@5nT(Ihw`C5GH(GxDCtQYo{4j*S
zA7Gp4TQ%p9q#fl^pYy`uK7o6U?NdSbTfrFhxf6q5!M>(*!~tH9*&LbyYXFth*@I3(
zw^JXSMRMPw=iTmPXzLTn#=!ur_<?S?nbB~gnkI0Hf`L;+>iu3nw4Qn1*>3pV_i$86
zVkj(SQ8cAi`q1*Ll1u&CvX7l+<PSWLF8+bSGN6p%Z;)C)WUPq8UuCsb)<lsGz5Lm;
z+7cOly8I_FVX&M;-)i-qCl6@h`RZE+7qmxDhLyvHGb*D<Oju0p@TCq5%fU#u;c6g!
zDv>$&-L(XlSPEL7O*h5~fYC$|ioRY?jjW{Rd?v?3J?bvLJPe<O$KrVuZXr>Rf4+hc
zw8j0p4Uoeaf<*vXOe{iJz-DnVeT(V`%TrGL@InsMGRKJ}Vk<8Fp7otCPVD5_u2<K(
z2fT`WAo@A`zXOMSi7?^M91Kc~INiFCjX`?a;USq;mS!4FT}WW0btD?#Dj_ek8)kn?
zB0zfx^Vgs>gh!82;n%%qDAGL-Dy=^4YSGT~z?CV<23-8Bso0&)$g>8QaeZ*h?wut>
z!x7WXe!_J(ZPip=)-AUB;DmJC3m+s1qn}fK{A^@356cMXv6R7pc!cKp4|b-KAzD_!
z-M?`CON^v9QxM3aJkyPd6(U`#3wn5PyF|EBfo0^yAqmH7Rp)7FXvBU6emr-3;3Re*
zy9|xsqIZop?B*c-3D!1YcMIGIBmn?0XzTcqp4;hJE0I$iqv>qj^|M`NivL@ZFzh!-
zNHT}J{HY#m44hP#H|%fD9+5Xy>vOKrFW1#Z5K+h-0uPXq^&Q5vk6l`VSwStWkLtO9
zH{TQUef&wv3BoXn$;bBuU&V8ot)XVLQmDR7w26jRYD;|t&ECgch`r$(>X}LwUh=%<
zfk>NX06Yil>sfYD#7`H9Cqy7M)bhFSSmRZ&)0m$2p-@^T<!-2Ki#M{_<2$753D=(p
z(}^uZGKC__EL#n?e0eo7Jd2`B`6o!hWNQl7mJhFNs<(rUn+m5^GVwcyMaDBf3Qr-(
z%NkqGF*0zrr)ade@6lXqk4F@%=DQ{3`=Bvspmultfgoy5hx$w>oC4LBcptH`%;H25
z`(Mb7c*F6BscqTI+HXub{lHaQcvZk~yQo;}{xkYWF!m3@u@{?)a&-Sq!SDKt|7dIV
zi|R;Xf6WNTRe`qehgue0CTT;GQ%KUS+c7L@NaEG+*AXINF23Z3@dmkvHQwO)412ih
zDm4tpoTgmo|0@!+udRL5z<84U54{|Aa5#7nkv)p<>HF(rmH+iNt#|&FOjaQ6OJDeu
zvV=<h+4j)S`!;7MnmJwKn+)#`*Q~ZbNFKD1Yk{)}s-Z0oN`KQnL-}1zZM|P@zx%7j
zk}QwFx6!&(G$S?8U>h3TR8n1+`H1rpN6m`r1qvh{DEwl2C#&qdT?mdSx?ufsaT<vS
z#xl4P4D;zqjfEe)KTZ2OCJ4yPLnhqa4DxlT_AVjLk+Hyq>&Mb`da4)aF1X>Zl(6Z<
zefg_@gaGPReO>U4ZzA2=!0vU=C`E*iMlFDtMBuXmnvoRJ;PdOKhv#$}Adw%P;8lh9
zHKZdGCewJ&5?wj_7pqGrSa0tW0>6u}VOIhUsHpMu@BPt9*2_172&n?#&9cZA8q|&c
z0M2LUeUb~(lL%-XAyQtaEc^3h7e&YE8R$NRqxEJ38dii^|Au6&=O$=i?Dp0sM;K$E
zj7>;ZpTZV;l{LmEo)e$_8VPv>Rltk*a&hT!>?5GZ%*9_nTwFzj-~*g>bl32ihB;`e
zJ?CyU;<kPP<M)5X8fa`7%`|H0gAdgJyn_tJFY@q?z}*0yijJu-8CmJpMY7wP`u_g^
zx8(mFEz<j+E%o7zlehJKa$9E^Z+W|4uZnM8DbOWH&)c$6KH5OHw{?4H;@Gl-8Ep^p
zhjMc%T|WM6w=iarC){}OyML;*TOdhSq#Kcb`+d-ht`*o1`s!Ybf~m1*i%B}|Sk8JO
zKX7dp^kaBWrW|e{@1$SJ)dp-UM^Qp$vP17@T27_7ANW3bc{gI$EGuU$gY_BgC-z|&
zA&)Cf>r#gtzla95{SX2#iug3_5Av+COj*1bmW#7I!xBC~*llm#DJao9%a)mN!`mZ?
zn?2H>UgQ$GkVrZ*`>?hikbLE~NxYf=?jAxKwbmTN0|lz11(Fj7w!2Ha4EU?W3hKRV
zHJ}m%S-7w)rF#y4JzU}3$cv`i@4vBolASLI^y&^BwK6=M>vxAJj_umSco_4WezQL!
z*A;WM7&yk9JH{LJAq(w3w}xZ+ova8pHomzFBxutXPDOd9KPDnpQ|#;T_&DK~BGh?-
zPx&n=+{rPp^Y;OBmcup8hjq*3yKMZvFH_HON^5iG8?OH}d8kwn7CnzN4PxQ7pu9ED
zcS?K9;Q>NUH~IODSO3QDyR?_!b`^F)c>X@L-lA9rE#P@))V<8b@tR7p3^2VY>~k+9
z;wOHwDT(5{cMW9HGR_==k6$H<>~ytJXwU9SKGiQJ4^a~76}<PljXh+sobdsuEPHon
z;MjHED0}eohsm{aDVyS;CABI%KhSwcu>#Rs?=I}b?9{s_gJmJ?(+*Sa-jGT=d_wGJ
zAc5(;yy>66wCmMOA}4I%JL+JtP!xCAo}^9VdhK)P4;g^0aeP{9-c?o)p|}tdl-%-j
zm-+A6m?<$W(8%MlzP{|;nJcpE;n|c}ax|cre=IvzA)hu2aMhhpqx0?ZQIrjbo-fix
z|LV>8sF6%p9X-(W5G`O5EiAifxk-w$rWq#quC&tYJiB3{;isEJai?$_pfY)}xqsV6
zQB1<>l(--<g8l(_e@=S+F<hvEtUb&uy4?)TxaU+a__uuc%|%pldvR=}9n~bed6^VY
z1Xt0CP;gXNc7L6d{PF6&I6Jv%*w!SSKGzR94G$)W&ue-e+OsU0?41`)|D;JQ1d645
zN4f4Pm=?zK2|MzR5czKrJporqPi?1Mg!^Eg*BaX3WfdX%K4EEu6t5u$uQv8oEdHgg
z_5u_Ri<rhv5O2M5lYrdZb^qN;#U$)eVe^~#zR1cVyqemi@%DGVukbbaDB<@lKd!!<
z-^udrJiqrs#2z+arQvJEKKk-h<q3~kpLXSFa?v(7HtyS)WMQ<+X*=KtJ{DKhHrNVL
zoaEsCGtz~xrWUyQmGh1C^~<mCvp=9&HjRy$?kwegSS~R+P5o#)h*%+v`=&vvLU(^%
zE{+0c5+Z$%l0H2zDrcCtuTiiuQy{Ky-rub*O_4bzSa1VLQZBS`#*_?;F?AFt6tb@+
z4AH*}(<}j4XgWp(fz9=gDrBIIzjE(cpGw~DhJ<{%ySP)#0T0`sr#UWQV9&nKbh)sy
z9~|hvl7D&6q_cHoFhyxF%t7r=>ul1IBtCSpcHobOc1MJM9uesxJ0DM9tw(vOPy%-%
zf$0jR(Ta%QAAY<q?}RdqLg!0=lJkw=`?xLEeQ?Bw#Mj!z-_V#Ux!fu=vf^K)NIAsc
z^wbd<KI5ISELv}F?Olp|EdM=xgy}-P7J(wb-=ZH9V&<KG$^WEUmn1gcy-7p`L@)ez
zYZLZ-e&URp+oy5u-AFS7*Fnp)+aLSif*7=_mQ$l$i+yVNAVe*z9KrYVIZW{0ZN20?
zgQr2gDT!eE=Z%!t{m{0CvI8HPuJhhlZ}3x-S{d8wIrpzDE3$vCk&w-?4}QjGEn1^#
z2$?jx>f)sq3HNy3;2s{Yijl|(@m)A`KMB;9y$BVdzaX4`IJ^qG*u=`qy^YNDI(qi3
zU^ptOD9uFL@@2+<59qX&xKp3j3%$}{tF(M07>5;m;?d1k$Lh-Rk&eU41@nrVnB>)M
zSsfH>YdY7+W$)>l!X;FHv7FkHp)6dV%aJTd6v=)saM4#U`<%^{Q|{kOt8r;lhEj*T
z`(#19u=@d}ln_vQe9R8*eStOA+?`Up30<}p_m(#q46k%%U2}uj>#O91-U|^*6FwY)
zJ>^VwS8y4lQaSEnmXXKr^`zSjk&@Xrjmn8%(c+>wecHn0`qNNntE{E;WOF#dgn!gb
zLLq$4w~8O-Mc;huEBO$9r@LMA=D?AT&yCBK#Q2H$%VoA2%?sW$(Qg|P>#XF<5e$Aa
z2|bG;DERbtXS#3(`I_*8#B+lO#Zlt$dR-o+FAw%~B{hVkw1M5s#`)sr88ZIb*#5u5
zM_jKJ+qiVDVp54DBwR!W!8&Q;8GluJVuHIyqI~HM9c~o{o~GoP8phqzv8i8mZBXT0
zP<H6~ugtcxVD<&}eH$mI0^?zw<EO-j=DOblE}0y@6bpH{3%S(MoV~7^KG*N5ZYF@W
zu`6y|GDtG47<~BNmZ-h7_wJoZw92u?8+(sbwE?I3@5#r^6JVL&1rz4Gy3eHM-cBSt
zzf&;GVM_T%GN|tSwI`)(AM&oEG{d3E&4?{=Ls}B;RpSyIPoFlzL@>4JAo1cozldnf
z4|{_W`dMjy6pPy*yX1;_(Fvr`hfDWO)Dp|YCXcD+Me#D+GTcyQf1SR+@`};w>&qx{
zrlLOu$xm`t52f&p1XAtHGuI7iPvVwtuG)sHtj}Zi?3-v(FloK+6(7%<ku-wY@i5y0
zM+a{J+Ca~K!0$a3$BEN>E;n453pe9S-&aPb>VCX_ivO=d&q&rOV!7^xwv7a0ihz3U
zzj|!tx9u9SpJ)Gt5^Y+t|M0sC#!BDi{$bvmdLMB7vXnyi>%-oPdzw9DR1XJuU>$s#
zj}F`VL)6T#--NA*Yxd~~By4E}2gerEu*@xekG_cy(-HdKBg<KbTXr11>$qBvDB>XR
zt?pfdOS`-nR#Qo(j!#mxn)}>DFE;mibEA&WNyTyOl?N9)Gn&FbBKCd{cOMmfS!PrM
z^<I*edRfEpBdQ!{S_jnC6K(bEt~Z&+oL$|6g}gF_@h>HzGaEI1b&vdpii6$E8&8%y
zQXvhC)rD<$;^cilo)XV55U}54Nv`W{{w3XtXVKK~9N*!J^-+Y_q}b;!?W~cI{5$6M
z>5DSCT(#6>#)Yg!<5>&RC%w&OZ&%Zk$5MmiLEc~M=WRxf?6cIol8uNLRYtGgJ-Snp
zH;{?cc>g1>WnKbaK%&bt)HsInY@?6KdLM&6w|Q*@4e`6jFq{ut_WtSS&OhhGpv$LJ
z5b&#)c0Ep7lrTgohq7>l*P!o{Yj|Fdr;I}cH&LMVO8JD%QyOq6X0RFLRWH~-<UDiY
zpy#jR+w1;&32Y|(X5DEe<{KfYE17QFjJ@S<_o(A{CAJ3NC+rS7^Xgs1?jZCh+`hYD
zmBCsSh1<*II6`_K^aaI)yD!TftM`X1>r95aim8q-1dG=T+emJl!?~QI&w~**er=lW
z$M-yLswR*5KJ@AH>EVl8a~>aWlrvKGcJ<}xv5%C`{u{L748+Q*v(7$Vbg~zDo^2=j
zkjAIBr37}_HWZu?@+CcKedN*CbLr+P@(xvnEK3_9FEJsjXJ=cZZ*C7!$6)gVV!a!Z
zg`kwp{4&dGqwj6q)(rP9Bbx8M(E~srJzFovJM(iawYIAHcEP6RL1D76#V5ymhWGdV
zLk^vx&LAOl_MasWsNhf9lD^lBOe4BKz_s!xu7vj0%1Ce4yaoa{M0N+^8@kd|zTX6s
zFe6&YFxD)50=O>0Oe32B4Kg@i=Q;E5CQb1?$O9)S^3-_gD?vM=pL)=HUnN0$94<U7
zVWM$t?V2ZvH8WA2quMzBEt)1OjZOYy!bXxx0hep%cLnm!4WicZgQZFHX`^`NWA9A}
z>EVt?<6@nQsorzWP?t3FQnwK7Ya4vo;Xa7<h8~i$2y#=o9f)ZdxL|>%CN<f`xE{w{
z3+jBrIz<L>{U}m%?m56tv1ayEKENH+mDlujfl4woSd2c;E8p{vppyM9>94)e{~jZ$
z)5AgCuMF6Rc#?g)#iKra<0|{bIx#-;g!IokskfECia<f<*6u@yNx1awY?~e|$s_KK
zSQA6J>XtQ^OUN`pGLRx1k7@<uh`4-{qjvnP+cO}CAVCBt;26F{W-aUQ^f^W?TW)`D
zx!D$ct(Cn@>;#oE-_5G;ySOt~$?-N=?s6E;A!GjCp#zk&ezGpb6yRvQ_1KL`&&KrV
z&C?4ns=unewJQ!A)9)_7$&2gkVZC^2jyp&WR?PNm{NtuAE0=-9xnmms<F?#(m;^CO
zBXEXqzKqR8;8}sjSzKyEC1hM=Qww{D{@bjpojk-3rckK+$j8VO+^2p!ioD>a_oY;Z
zVnJ35TuuqQec%9&;9w?)Tlb_OrKCvS=1D#$zjdN327J!Wf{po{j;c%`?%e74AK}@;
zROaku*8RkRy9Qu>?e^qOr?sdn>2K{bB@YM2Y`Jlq3>>{+djCg)(SPr?dZA#PL(*IV
z4AO9fA(pB5^d}@^qrd70d0s&TzQUdyES~9pRZjK?LD$c!ydG>nci8I0Y)xhw&8fx&
z8rWSf6l{&1KVqu|J=C4@A8v^Yg)VWO6v*Rr7I#wX>m1TqO|)Ni8%3J;u7ve`G&#Jt
z0u`u_YSxXPGQ#r~TNF594#EqYK<N)b>QyXUQMa6*^gB4;?8HnWN9B_07@nC^sQJWD
z+vFlrlH7AOMjluBD0wZH<j8#*fF&UO)R0oYs>@WR>p%Y&TRSJ?Sk$OBan{1qO%)nS
za836#ARv9GPEvbVSmbR8L6|QfaA9y(zw{<h{3p7jSd|=ghzdp2vR6iuxs6sBow@)Y
z4Zkl&0q6Q!6PHRpnQB0B=~{YFS^p3thBmVhH7w+kA9-7QJ}gpao{x2nWJSgaS?@oS
zHxG}XG=UVnj9A=Gnl~2LU5jP8V_bz5L;MzC;2;tVdB<#1knm4N$32oNvZwFDZi#~?
z@pD}n$0MHUv!Bv%o%sQS`BkrF-i$3|vB0ui(ci?uP84wB3K8wqsL%t^M$b3aU7dp)
zqt2}Vvh%ySwb3do3UpUeWhLT(o(JGQn1&zG?yA2=y*5winC0(;@$WS5vb&Yn#azE4
zi}V>sBD@fOiZFt7{_rhv7b($_<CB=$-w7euS4Z9K4oO+n?tcPv2w7>Q0~W{FP#8K9
zYH8>B+Yfd&x!X8sknImp<-eX+GNp@ULKVd*+Pm^+uqzzNyR?@XrUe<a2x*saHt@0k
z=cuOdufBF_Gnv{s{iFxx3vADDJh|!w^Mh}Ek*WR-36@9*j9OAbyeXr$JNSiM9qCP9
zG=b_Y4isUGZ-X(Uy*2Gzs<eJD?~#9n0u~QNk>f%(c`~5qRx5r0dGzjHE(<YqIZo*f
zPQdVyk@(8sRQGS3*tnMf-c562)>%7V2Obt;wIEl()8xQ`oHSm;`PWZ?jELoMEfD82
zP8glsBWcn%7p@HZqm!Y*H5j(Ry9+TLBqyr;r5q3o14kI&%@4^B1q2;#C%WDqU~~c^
zbpFzp+Wm?$j>HMF^ot@0#n^r%>+8Hhf0Ojh+FRU`o+$4mVK&kmF7f}HE%}4W>s0E5
zu&KWQ8)4}bAaF)u1nfOQQ+`o2I=|7{leK(Y60;%90>u$1?3u;9&Xh9}kjSE9(bVj2
zLD>5qH)A6{&a!+!+i{fYq*g<Y;hL5UzsHd5#0ijer#hYIl}KFX_OU;Ql(w?pr9Go#
z99|<vA22C-%)UIz*?BX?nmt{fznh0&aDo}O-1ho#-_7{jjh;M0`~-5HParAv#Bh9e
zQtHOxJNn^MoU7<TV?NuOXrB;8%QYQoV!qW|;ok|Sq3`hulBuhH<c>o1BaxB)ofv!D
zdU7$7Dx0MJxzSrT6<pTZ(Qt10ftmN&)&J=`%9BPnj<2Wsnd`1z+h@~k@l3c%HxBQ7
z>DqyUx(D4NYdZ(j;Xrk0(WRZMpQiiwv$A%(J8RYf+$CK>Z-H0>f%PB+mW)j)Kwggv
z!Q~=D9lCH%2OY@J2pSH*FI83+49AD7D9t~-^xE2j|At}UEqM7xVUi_g>I);ue~FrF
zAz=`!ml4YT`Q1E2i|Z2P>>6=0k^KYvUn?aJv0>l>v7XL>s_Bx*ajtz%pLOKL8mmmC
zDRKSSqs$G-U#tmaeioV?f6?e+_vHorYT}*H-i={_S;p=?%V;7SnxOw^_kjX(6Ba`-
zYpYCXby3g_kR=0eZ#Ej1LY+LG6h<0PD=~F$Tc;?D3p~;`=J|SIx+S+UOIY2o0$Dko
zy@aE5%R<(a#=!dUZObLfBH|ISlIpOttg$h7K_n{Nn(DkW7t44;Jh0<rwe>P1QIS+c
zUyiba+cV_qMkaDr0moyC;0Y1>i|G&y6TgMApLTh_Xf9N>dEXvs4HbpAt-PJ6V|S{7
z7{eu0x+SY++9)utHCL-jiwN_G=;8jtHLGv*fv}Mjn9}9hxIE%-nh4Z62um-uTTnR_
zk??*-RaC;kWLxe;Y0J*66el#xA^PLO_*QcX6IRe3_{6Md97tCXpD<s~HI4lQkEH!I
zat!ofgTc}khFx);D=3&#X+5dMmQBFtt?*c2htjsPN(Azx!rFl)9PuULheRi#6^PW5
zVBj?ACWoPI`iFXW>!9TxEO3+JpkWBruKW8jg;~p7TN6n~F{bjo07f1($U?;+n~NN`
zdv}^-_-pw*De`F!Mdr5BXaG;GCo1Lk_zu2pUCcSRDze96L&S<js*t!t*zl7#CiK~2
z^T&OJ3pB617)`bwv|J-%1;u$3``(#p99e=h-V&M~_itz2;G6cXtT{!BLI#pqF6rJ|
zpuEkTbPP!lkXmoNqS$^ZI)(~F7#wQOeRt^%ED+B+ExNn)Z2)(vU9NLws0F<Wb7A#{
z^TO)i-#gY@y0?vypBO4$D?|J=p*=y&63Uvo_X<~f>A{Xh6pay<=gQ;SHv&~ZQQ(Y7
ze4Cp`7Uk6c`o_Jdo@(KrF`BI1u|0z&*<Wn8PFEl>{jA5w4^P^RLl?>mBU#HGI~oVS
zA9LOSx}(Ci-&0GPUSDN+AwM)HkU+kQKlj4n|Kc;^YrHU^Q&J9>XjJ{}a*V&gT{gBv
zg|Q|mBYrA5)Fi{$nWW#*u|RdDJO%_ZEj<5C)tTWdK@v0zy9)&z%w~LAOk4sxNvuDC
z)aAIly(rROCn+%tce4gsT5;l{915OGlL)bp6G?z{{eth*BiaTT!2WVsFtwwdEDl_I
zbB)|*B(Ke;iAt5>lE2s*8KQ;^-t2$3LdBMZ|Ic6x3eYCYii0SqP_)rJb+f)!bLYeA
z6{Xi}*^C!^sc~_i!j;nhqNO>^?Aviib>o?)ANTCVN<=E+cI(C$Z5+A}B#_*t>!;h4
zvRNK~YvZaune)Qae~^?~WOpCSkxNS*C?DA8iri*H|00zqZw~LO$`0}nhiOHeMyAP?
z&$b6d*pw!3V)3XaNNg5);ga~Pb*U0uEW+~Xh5u(b<cU(f(NUt5p5_1<LN?RV>ZhhP
z4Yswtu$9*D!O_vH6Ye1X-mLSk;iwJB33nn*OK_LkW5=Yp^r!G6->|i76y@2bCEmQ<
zkHbsi&Dl6(2f5J98t)D)3Uz!uXzZRJ1rrFq+fBjA=u^ZmjKjX4ET9wC?F;wsqUyu3
zL2b$>Tt*+{tvo3);rkrG-%qmL&HwVx*9VXLk#}g0scR$(D}8yYxE(uzLmwkQ*dYBP
z4y~_QsTYoUk*8<LO7D)tV9H5Kr=uAAAMma;Toqj6uIma{c&3jyIfV7%u03y=a9qf%
zj^}?O#)tQBx>4A$NoC}d9+jP|<?dsa>%v<HCfy!`#=Ep1H`MYX&vcn*P2wK&yQ1+M
z744d;1A1hthSr*NIiu1Fx;Zx^9u1$9j}xwOz>lrv<3sJbk)m0hZy5<5PnM71b<Yr8
z>SNK|U4Yj~sa!tE*-(GMLr8rYZFGA@>Jr{7>2?=2o;ml{-1bvQA&wb$x@%?h)z;Q}
z5)6PXO7tc|!wrGdS2N*i1;_^kOD^aYVyVxAUIW-7MLo`f9m9-hHF}OhzK#~u6$(Rd
z8wR~Ymzv7jA7PostdsF&lZj)~&h+_m6P`FdzS4`pem^(w6QY^!(uQSd<694r@!p1)
zz?5&gwW$<Tu<hiCdw!U5*>l^Hu)^$BxxEJhx)RSJ>rsJon|!OrOecy5G~O4Dn?z5_
zvZPI1Wd-_|L?_-n?5$k3{}jMzB5(H5s8#NY*8E7}(^~c3r<;eKu&mZYz=5^2>9WVL
zIt-6&y4+ponQ%Q~)xXA&z;JvlFaGT<@oy1HP#A}Ah0PkAi>6@HQn?daSt6okMPnm0
zqOYF_Q6KrHI&W^A7dpLTNwv4GW7&@d_&mFUEv%L`&wC^Pu6@KfyJ|l#@%SWX&k*&^
zcIi2rV}^Snh*VI{I&iiB6gG>JW^Pu?OMjyL<gDX|*i-jECYyI`+}+^_S(@J82Whq1
zz|PH6*j)c-{<2`K5=FU(VNg;r?vdZddOfD;>Bu6T&7BLDA6kOo2Qxqomw41P+f>2F
zUU`BRPo#r>3Sz>}Z(<|)>F*1sep#|D9L_8mxBiM{eu$%=-)tHZloimSO~#g+I$X|k
zpMTySS3KVB&L^k_$q>m|e7?!^uz?Fplg19N$^!8pwV*jemFNZMeCRtb%KcOyFBnCu
z>e7)9y|$a~y`NO^vxG+TRcc}JPjjdMOJ-h82!02F5=#J<W0eySs`QCK`&Fa@vOApf
zcUI-{1$_$%MH(h;$%vxS^pUd&bTzA3369Gf46J)>++xH9r>*BU&nT|_vnUwBV}MG<
z4e_|TA@JAN{p14Z3fM39Z#FtkK0II8ijJh#O#+P&DTMyiRj?sw1zQk~iTCgRSwdIV
zc7a!z!?8L50YTthVA#%R&btK<M1<O4iLj6LCocV7I`|QU#kX$y5x!e_-)b5dX&$wF
z!3Ym`N*HCb*?nXTumXUOrWzzci%>DkTS>v@5-M49oKL;&k<aArM@L&4c~O=E<^}>0
zaOQmJqp`c((7cmi0M4bSn1Yue99QM)9ue$94;!9(UG6BssR15=!ooRCJzB4FKQ)vT
zfOtWLp_{YNvQU-TD#MVEBWMZ}g+!n*(SUd@ZT8&TTX8LJ+I*G!<lm#)On3tzeR%M1
z6uk>I$^0w0`Wuc}AM?8P(>1E#d1WO^;Z9|U?IyCoAA?R=#97*WbR`ZN6&ZYTgCiS<
z!EdMDSRCD1gZ<3PDGtw_@d7T$JP5N~tDu>9gXr)Yw#WEO!0=86-tZ1-^JCtkWJprA
zr2Xv+{8l^+1fxb!p_LiG*!NcZ5wX%YvQ`ul><Kj<xRdtX&)Z2@qlfJ@fgb_~2Zlc^
zkH2&3OxlhgUX`nA4*3R)M!>ee>XfR01FH7y&)t2)&y-YKjzPxp<4Sz|6_%^i=xQv1
z()rK{k*yaDR1KDhYIc7a+kW|66Sv)|@Cpze9i8)M8wWgHAa&5~e~@4)u0Xy7#%A{`
ze)okRXRz1PMz&co-Y4{=3b^>N)UETvi|Ra-WCv=+GN5c6SxD3_H%NZYpHMk-x51q2
z$)te41d7-~(MPm0-%vcSZ%@BjK_x+Ch#$aG7P^OoRz$p`4`c6QPjQaG;MREc=R`Mm
z(eWoK3zMd~|CG1k?EuOm(4?u1lF<WO5R5@fRKP)j>^LMOg5U_c&4_+|AKW_A0u(`4
zcj5WpxqFOo>Z_WC<PY!E`ILUo6eCVV4kex}b)XSXZPcZov=Q0FR2eFr6w6Ssad_{u
ztl;jomx?zB7k)KH)o0=a@ji8oOZ&MQ2JqJkuPWRCWv=pCO`!=kxVe8M<nkc2vA7K8
z6uPTS2GC?&2N(R=OnBvz<=<Xe14^OYLNwl3zSAwYBeQR`)zjD89Sa1jv_L>g6i_*E
z{&V%1Uby)ADx1k)b<(`^y)95t(_iISj#U~|K8M<?JnISC*!#pr1#*V70n!dukl~*q
zMzItq*pKkuE=wLH0)!=P)kJo4=}^3!dH8JP!&5|MpB;3Xwfo+XdsgCJkT>Xl##@Ne
ziP^X{NCcz|#T8xdWHS5neGS}S{I<`cM<t0N{9bJU5#UxguAL!gpx@j<^469cYrTU!
zFGt_;e|ZJJ$O`|8V_X#h4vckKRIr9YyG@Az_#Jr1JJ5Lr9}b?|B*}D+slO51{eoi$
zT26$(1LeMs@I;v9mE_afG-&y`euA-ra0H%Hff9%287Sk@gO0N8ZemQ@N`D;LlDv*I
zms@0>&S~)#rklju?z5=1jg>brOM`scrSnJOx+B@(Q@&$7nO#90-O2oipV(x`m+-MI
z2xvfDAbC5b_EakB8*=vsO^ilMq_ih2DH^Z-HuTrMg_8w+Yl{NO0<kn~0-_542*<LD
z&|}*ULRY+&k2vS@rn!z@=~pLgXwC;w&Ue1HdxkD~(+yq~IFUHH*Gi!~)CDSJFrpJ$
z;xP<;d<#^r8NAI40t>UMVoUzCu?gPs#+!vEJ*yV_-8Yta)gZ8uJ{$VAJE8L!s!Vmc
z6q6oi_2Q|MY%ao05k{29?Y%>JkTNcNXT~*Q+C7robffIwy#8+Ph+`64JQ6_xlHl6n
zY3T9T%O{8cv4Uh6F|>R;N|yXui)eat`bEk645+{u6o+&-v$wr2o3|AKLkqdzB{R^;
zqg*Q8gEGplR_YOFrKis|fBcC?xEdg6ARMWuQFKEMAN3BwUI!Q&|H3>_hJ&T%2*(ih
zBOdfx*Fy711aL<nyjHz@PFLY#F1c1xDdcnzf+^kZemXa;y`~geFW5X1u2<|HnlHs0
zkvEtm*t?eawM}q+tJi=r7r7B*^=cQr?6}ouBxzcbCK2g;KmDi<)e(+$U8TQqes;fa
z8z<~TrSiDDsO|}Fzel-Q_}9bt^t-a})tj0j>G8oHS8M~9D7&s@?#;RXKAq~TTwRFR
zPk+#Y84wv{#6_gNcVxLmLOs2EN$Yw4_UT(*zgiDj4>0XuYbC>|?@x&ie`A-crOf&w
z)A8TnXA0aBo+t3#w<n~wX`eq7czb^Uc6@N0C=vGa_h<W}D*lF9m&kqDXNkK3%QyMh
zHp(UYgICm3n-nItj>w0nmm~3kE5nMb#%J9ng_gEj^uCX+N~~jX{Y2)NhsN{^9iBt^
zG#Qh+rk|JI`V`6IELWTHTnY}dL7Cb4K$g+P<2PB{Br+OjIMNYP-!i{0R`YDut3L*~
zL=$=%;OV^T66^k&)xQ)VwHX3hr9A;%Gn~AC59VHeVw#yZAa3S;WIIabeg7JKjo<n|
zj?O!p&4&%3#NJ!&RlD}C6_nbew$`3io7yW;Ek*6xL~X6TRY{84tJGH1-c%GJ<IC^+
zGjby5WV}h<=RWu9-#i?7$h)Mp#1&c03Sm796+r}%Fr~i@?FcGW_hOuVyeu<wDzu5K
z4y`^Ye;zQ9>vp3hJj5QyUybt)^a2LN+Rjo=3gNEf?E<9=sbJ_z|1>}y!Z`+N_>go=
z{8gOQE7W`YoEOWZFn4X4$SlV>QTX<i+J3aQv}=z$qW--|H26X8)40On^K=Si4#S4T
zodaP^20XIm%V?J2HXjEsu3TNI3I+CEh4qyMa%`w<E{1#i5O>M?O&-K%l`dCP=%h^)
zo92hFac@6eTu4FlZIb>zCo>22`TO#mxc2t+taYst`d~i(agl|-lA_ixNoPAl`k<My
zA7((jMZSM8(tD+;r^NlQYJ<?!3e!9DTKtbqmpeUj1hZ3MImNLow*<HH{%_XPRQ%M<
z`=eE-<X6n@r~vZ#GT(ThF1k1lmfGQKk?jqaUP$2J!#oPPKjKMv*o*fMv4<vMX?m>;
zXtvk4vDbK}_UrEsAunLqrF<Uk1bVbsvwk>6R_xa2@rvS|5)E6<UN@oFOYYsqiTDvl
zd85vM)yb#?*JVhDjW)6@p78cd@y1UK_r)6~>Yk8ZGDy8HO(^Xzc2uK8l;ZS6##^dB
z+md$1;H&C$y0(aSK2$>yy7m<DYkM-w1QDwJhnioW|0(?-x@kK8*M4S)WRoqL)8R~I
zB2=S8fzS=QPXrU%r1b~?X-60_|Mpw#Fxe!ZB&XS1-4EReP32oPxBR8nfdZd~(@h8{
zX;+^<FLv95NePgIR#dip>BdO`?Ay4#qe{8F5m4B2QV&c%3)RU91cs=y{Rnv(0zMWc
z)xe}c$xYkH4J0{FXVnjgs%)GOW>ufYr4yDykbyLLI2wx~L2oVYP16=L-?IqOIo*GY
zdq;&^o!>!P^RgA0`GcRV(ZZ3o>*ZBY?CXNJP%yH=xhK(li)rz2`*V~!4DJ9JV)pR5
z<Ym<4g~SvwJj4P4IeZw^<t0`UxOxspn(4YC@?0^O|Io3z<wlV)QE)#c8{bu#Sb<%c
zuWI~Z=L7gudKOeN)wFXOII{j$H}zfNklZoNs1b~Nw($uj;4}Ug*6^0AUOxHfIPZj@
z+eR@0OaCN_7GLbaY{aUG$#EZm9*tq%6rFLL|3v4TOqHVxoK^gE5E$iSqgTxzs6%pS
z5xa^K1o2hCX{o2;qhI|mP?KQhO_@s}f@1j?k<pNE=4v^ns=$%w80Kw=YSN(R0_sS!
z?;5+W9+-`ilK!oJa934DvgO@aD77Lx&yZen#_|p<+(bwFI;?0<d>ojf9H^O5Ub8Ku
z@V;G@!VH{u`e@dF$h@3yJ)f08EaxW^u2J(>y2OJ*fIrlsRHDnZE^EPOc&5DNfMu&I
zj{he)NLyCUz(M-Un4@x+iX7ju=jIMNIzBq^iCS&So<!*U^Yx#vC0tiOp~5rDQyA!A
zPG@>zNk=;H&(lNEN#3%-qkR?q`$T*{eB6IMNRZos^!QFcE43Y3C(g?0bM1jak48Jg
zetil-8v>y<OuyQ}gOo>;j9sjYNs^v#|K;P_JS*`}e#BcUE5_jg@v4n0@UmB$oXss>
zGe{73_(`Ox>^J{+8q{X={Jl_)xUm0k6HQ~+M8kKg>DMI05$YmUx~bx?J!c5JvSVuF
zS8yN(-HL0~T~DHL%3lIQy{y2Z;?OU>g5>*TD5=pJg^!X>m&a2;qw@jZaX_`fbqJ9f
z3Dc)<x#KkyX3W`|_+~cTU#`EwET84pt_35dhdxI>10*Dl$!zX^MA*%}R?B5a3H-o)
z&!n25_BN8<%yb=vP<UbS$n7N6gy}B+m(w~O*wNk)tK-3o#Fdy>IE?w;en7hR>$4?E
z#Ks06)WUt~Bb;swLZXpTe^jo<ClV<C264tgVZ~4RwwBjz8z7jDC(_NJa}`g;%FN~K
zDJYvCp+&EeVc_>O5Qcp6beu*1Esvm%`}!;1=F$(m%5N*n0n9;%b%J7hxvGU%PiVPw
zh(C`xue#^+Ic!_NSE3fC@ZLX(vT3jvEVZdT)JtU8g@=;T0-qyfwqzMIHZs>fUeE04
z3R-q+N$;7?>q<g7muYWR#k+aEV05a=2lx$Ki;Saz&+o(fLGMKve5NHCM}g6=ZOpoz
zKCnms#RvQzevx@pCp2kYL>O7j6WjeXM>J1fnv3bZ`)g+@?CGQcOz_MQQiHjW?1!CB
zcupdbTxyeTqP~YLyJUjftxze85y6v5+tKW9kgy54$)m(*@kA;Y1QFc*H$#riq<H@G
zx{<WPBk~`d`hy+WuC1}(g-X>k5>g1+E*Uch@vj;1)S5vBlP3(7-v=fVKZ+RUkgZjk
z5R8+)iGq{~{$_l}I?lr-gt~Cz_hMx@v#h<W7V#*M#IQ*hQk%qf7lhO%{^o90E%0a~
zmSaHC%{uv~y{44reS?R69(A=Yl}TeinikUQLQQfaJ8$i3MZ}GSVn2y4P^zq((k+$e
zjPA_5(i35O_v5y&^ykZ2rn1rKs;4526{=1|Qw>}|-#zilZb6fgNy-N}4rUh3#HE$&
z6L-9Ue0$Epmj-=II1;Ch!FOeA36F_<>Y0)b^2T4jYtWEwxEqNXy&3%Oby4&Qzdz%I
zS@69IXco%VlC$gM$p0W;O0CVxEB38;o2178o5Q_~kOi}wGzjU)qeH=g$&8jMll`|o
zYMJec;~elH>L{XBmH9=Xq9}a~$AWNHW7AQxz*PJ^qZF+5UZS_0+rt3edO>6-*%wa8
zc!}sY=YHF)CY_V0Co$as=W&;mC7{_Wyo^E&!!P+M5sepv!f{FUJWNrKw<@<9j~Csx
z^xkQVTr&_}cBoTL$Mb#?_YyE52>W!aSosY}NIS^YXnsP>Zb+S2)_P5pm_WbBj;Hi2
z!QC!9A>M*<GBW%qg1c}>um76*orQ(R$|udYzyBJHA<MI&<Op@M`U2*c5UAik^+I&V
zWn*)WfZ04na_vCgYHw%wOy+GiKddUVXEy7!=5<1HMN&Ur{t_Npyg{&yZ`1EXzgI%k
zPBM+31S2ei?R$p96D)il!I_R-c?vVGO22M-3v9b*dj2Sgmm$Bz_a2M6ppVMRf%UQm
z5rqHF9yxDE*l<q0+|n53mCv6}{*jjAgZuumlCOg{Scv$cJ$T&!Jea1HRbA>;{uX0^
z{9tZ$RzPLS&s?a+7BB?W>J$C@NGibXMfSZldE~1E$+y0G!0}@>ha#BunYgS`%H?P6
zpe2@smC$+igOKFp&5(od`7;HSS9~?bZ^K`tk0+A6pqH{2)#3R$IKSTc1j8?1*;a9?
zhiODmXmPCXe;ht)eDGuPL%uMnj?tIRcY%$?M)!p35zzr^Z6K!%!@)^HiYIKzmsivA
zn6dEOnP7)Arn6oIzWaf5-zRiS?a#zPziOULWC$uHjR5(ztc$l+gepW-M-snHb+VAA
zm4hR+%6-eZXP3S{F_(FtdFmTE@k7^N1Pg4lV)Sx+J1tB$C|nnN(a}LW-*h*519^r)
z=6^aEJ4dpmfN>#>W1-KZOL$D8tSa2V-<bv8kp_rIl0ifj<c=#3ylp$mqWU?$5lhuV
z)y;C!RdJ*J9Ev2;Lh2|r4W-OKKuvb2!QGhJn)U2)vYVn|A5s|VyamTyP?})tyG&QU
z^KgbEr1THPVdxFqU7|9*(mB4+2=_GbgBw>gVaY8z2EFd}^|9oX-|R2jba7WQE%HhC
zj>abSh`fM~qz{)tPb+hqi2PQ;zJARRA)4(vb!(A+oXgEr@YMOm_%(bN*iLjr5$^ns
z&R@slMd`pV!(XJ|0s;nBZjXxoI=En-_d}mk`;D<R#=qGuY#?~@K0ismF61HY*CFL|
z!X~e)jcqsHU-MOJ*~w9QL?i$HB~D}~HiQa)6|_-K)-~<xZ}=JO5W!L(tx8wq)jGUs
ze)<(A)7h&n5a&iB2l)5PQ-H;_q!GqA^D@wIg2QD)Bn`R!{-1$s)AC{AR2)*YRIG<n
zxq!o=@6jKVMZ#&$e9~3GtCl|ts<_3H`qXmETJ4lF&v!QLz8vaQcm&fq=q&AAF0s6%
z?rf{hU+F)2m(^e9s(!PKw^oIr+!feFWJTdW&?V&BV(`WmSA{w=lIWb=KYx_RP@IW5
zgo>x*V!b&}nsZYO_)~*~-+i4@+6bsv-xfP@v)6}n;RraaWs08hc@D10G(U2R{32lC
zBpNOm*`&C?v6Y#w@^^3c8_{Rw9r6W_RgNAqk)Vt)Z2hTLo^e#KUNr8tpCay(<CDst
z2HQ}Lm}ZMNk)C}^Tq9zBX2lnQ9N%ZNxN3&xC0-ak_lkdLT`jQLHzJb%_&LYNQT~fj
zY{;&=+QW`;?w21Z+ejldUJT&iofn>~pD71YiFlrlSAB&)@gzWMgTr-ozXam*cOdE+
z?2~d4@yKmevZLHfoFf$>1BaGK(S~0uo2F&aH@P88S*r)(v0o}~XZw1mdev6X{GPwq
z<p0<m%o%rA_x<kcw8ln(QfyX#)6=yk&9*d)%Q|+!E&K_q@(s$1e0^UjlC8<Vl)Nqa
z-8qh<gnzHQQ)Aw~QkZ-ZD1P2^7SukYNIl<;t@1@t>sv4>F_9@b6)sjPyjYuS(^8bp
zPZb$gRoCA+!Sz~r^9l)__*qniPoJYC!FbipyhGRYhpp7mM#`;OU${A0!1rYqasg}O
zN5XBcJotmy`fSy~S7^P>MDoFwTb;~>yeLX||0-?ef(#y``(T;U@K8MIKuO}IIjyWU
zUJjTF_L<~@vI!97EKsGGdTl+j46{3%w`uYrm5pmIa(bY}w}DTAC`&xBXq6l7)&dt~
za4pkW4PJVuMlKTf%&21IF0lR4VIw(xP1kTEe{m%ukrl)Zm|+voKfhO&pzoUv?aX7}
z0P)dp+>72ga2Q~}P7h&^<MmyyR+tfV14|;g5X+G00Fg*?y)ROBs%z3es7<rGg5XUx
z57cP*DKMORJa!cD=e$Ca!Fc2;^6Jf{sV~q_Kr2j11w(m5Z>X2gm?<cxV>3B<C5|)B
z54+cV_yTw62cB_Qd>PllJcvOo_!WGb@IMU?OM6QN4%g@5Ds3Q*T%|j*lXwEx<FO`<
z-YsqbGBmO164zRW;CZ!9(e!|&7hSl<{08scT$y2JBVH?v$dRY<_{pa-2CL34bI)b^
z13ARsy0)PF9eTb_pFgQtoouKL|M)ZdNS3IAW^<jc>0To^?CXCuDBQ5~72!&n+R+zR
zKh7EhYJZyMB4ajjF4GlooQ$<+8o~;DhA`u>7nHb$I!dYU4K)}nA6~oPBbhT``w9RX
z2B;O80!i$~;k28qV>0%{b%E9s(JOps-G{XI>`jh0P%7*o9X=_FsQgEt^&|UMQr|aS
z3`adn|AeBc<0CI5MGtJ(Kc$MhcOV!N_!N77?h_ME>l2y{^y7=_IF2{X+MKJN8nsui
z_$Rv4rok9~-*Cc(y?eK%uY#UoiGVdCRod%+y^<!pB^|*|*}S=%qxa&}uTgJP<6d-r
z$04#234>p%e&@(=fvhN&^B-$GjWu(Wi=y|xJz~G;B7?VS-Q9Eqf$A1_aIOV-I{SVG
z3pRVdB6iy(<+-ETPul|lLE40?#|P&H(%KaaN*}EiE6?GKVdTn0j2~i8kQSc%Selu*
znM#Bv<fi-5xL^HGOipGG4-RCy5$n3a9Kc2SHwc$fT?v{@oT~}dH`r0@tBMaoDHw1`
zq>g>~ZW8k$TG(^v_+gD-)SM3uznD-b^%Dp|&mLuXakBb5XHlS|Q(o(B-9O*^*==+V
z27)m-epqIRJ@&Z%rz$9pcc%C#j6j*tUI?Sv_R%c_-y-Zb8`?a)G_Qg&Jv^T~Y_i1J
z3DOl(zgNXE1`CW~?zSyRpzSyh0v`UNy`aT<S58%8b#I5GYTmiBvciP1u^slXsw*qh
zvvk<<bWg+TFV5IQ#ZtyYhmWrsR(_{6HsOiT%H6(wEWwPI#$)yLmNvMOM~5cE4O5CX
zz?*58k@Y2$b{<{tI8agsVCk|#L<lF@y&hsGfuu?QtC;CMhC<|3r{fz+cXdU2178<k
z%IADH7{YIz#VJCN>B~xq$d;#Pg?Jpe7~GOzQ9Libp}8!pCSq6(2LINt1ezu6t^2bK
z2K?n3k_?Ltf^7^IH>?uK<uwT>|NFp{I<&(Pd26>Qg%#m{5;^orXbi}`<_Rc=1dGnK
znmdfR?YWVHDtm3$;p`Uz7YV}DO^Fsfc+ag~CTshbZ*YPHVC7<`M7Szc;g~9rDPbz>
zFEj=ix4wTXaFov}P1GL++^@mz+!0I^0wnJeTAu!9OU%=~40ikv#9DWYh82Af*K*YF
zI%ys3dUAYE9Jbs((=mEarTatm`q9$-*Cy2C3(n0@#|842{90FPRdUKlzwOS95IERE
z5eDR8M<&{Szm&W5n~vZsg$S0x*oagguUDX-iC{Pgb-jwT-9(UvX)f(sPy3;I-R^L(
zLNnC%E#{dxrV#mH*?}X8lSBH{6N&kjjCm0*FPsf){Jy$eesiI8L7XRlR~@LkfS<WL
z0Nhs*!AYy>g}yG{xHRAkaO9DD^o#i=kSOMS9!Med;I5yngqNV0xQ{3XFGd<ck%ALB
zd;UsnTTGw+5$wgLcpygV5@ODzQg)O$0$VjGRK`OC&}Rg8M9D0&kJ9wpqZ6ERvidf_
z@^&w}f4kR5uUqi4>d5DFg%Kc0kc7&Ic5eAixs~ch-akWIHy}hMvms(S{XMAV{qfEN
zDVb&Arty*a0B8`j^WPt=re;?8Z|E&)>o2r;q?Juo)Ot>CTb}Vc-QdB0IHJ(S&n~OO
zjciMF;>Ri$_<}x+^%Z8(4#4!DyrSjAId@td&+p*=o8lI7;gf}jR8#`<)D28}hyzY^
znZ_Olaf&op9N4wC$BD)HzdNU_vm!$%oh9glC}nbr9lGqFGAQ<_?7fr|I@npPXX(%5
zrV1465b$5r7hSUg3OTUbCP6y}3(?6Vq6QazTb!Gp9AnFO_(=JSickiRr|9|sT~i(f
z4JBJ`H2y8sz<$J{Rn1OZNKh_#TIzx_j8d7~H|h+Lvj&xAK0-y~s-|Nc*hYZ&h;`*J
zP77g}bL<*E1ZL*=DHn4y3eAYpFaEb;cvcp=;Md{B8u&;&HX@9UqZ){La>oe+%0AXG
zOT>J<#Kf<|zauvsl`4bjH0cLrtKJup#(G|QO-?QmtOsJr(*{&9W80Jx5@%wWvaZ4S
zjZuATn}@;*?IjgZ%Az}JQ}rLcbw&R3>`DAm1d@0{`o3@R9?5u-<bEe29X>_1>@@HC
zmw4uoP(t8DktD@mu8r>=24ridig_!$`*@Q)q4$aD;%>Lyb!hd?ANEa(85~vV{PpRD
z$>^2M$Bt-;JNd)x6F{=9$7w&JfU6`HUzIZi?G_=ek3V_l;gG!yyMYbkNlJEr_9wLM
zcl1=gMAp$7JE5)72*9^(?+6B+%2(YQYlxZ~M7A297c6p<t-Dmv!XZ~!^%kiF^Wp-i
z2Gzs3Sf3OFY;fGcKL*sGUa%L82YU+EEK(tB_?BR*x{}J?X%85!-8HsmCxptWm;(B9
z#?;>+3^!%ryysvN_X;)kogM6hoY8zaO@#Ff;XLSC{$3vOoWG>Mdf-AvRoec(zfw)?
zA%pJv#(WZ0SWJS^q2i(ZSaZ7FQJQ>R4Eq>Rb{0cB4S9x9*m{hjW4~ocMNiH3;S>?v
z9g;l8Qzn~W4?xSO$S{I}23ueL1jUpgV^y(+C9T*PE-q8x`Sjn$M;4--PcaNnUwwfI
zBsCo~h4`jC3ZD9MvlkqlKt+!0g8T;({{D~Irv%@^Ke3RR(6ti5-|67Pc^~z@kZ|Zd
zvAYrMuDNR$SNi7@HuKk0yK*lGSclPQ5RBU*93AcJu%dc#ePy=m_=lLnk!N*L4j$A(
zR1}-N&sln9E!Y|?^0|1po%o>!hL8p<X9T$WEl_%H5yF4o8XT~F0Pm5+1W;PO6IpA3
z-=twTP<_|gcf6})i_ePiWqLFjIrdwU`S0cA(<)=%Uvawgh*Sb@U#-GZ(J9nMpB+jz
z{t*7&KHQ)(pf<0Ei43ZCruneFR5>x@+NTIqUU1nsvI=>@5sN8@b*=v5B6+=7*AXs)
z>DeD9j<9z`?~UuK?c+84$lEmCP3-TSk}Gym#jDs5oZct}AOmSpX1gtUB*KnfFKm@q
zQYn4`&u$XBY@8D^Tf=G1$G9D{Tfd{Q0(59F6Mb5Y@LIN1f2Mgda^GBM>MlGX+ED%O
zds$XF_z2E7TG8J3?-8KZ{ki5n1*cE-dn2AsJ5C2n4*{w2pjoU8Xmt5*;o(x<;SKfl
z`i9i^59bQ8x=BJCLx7UDPwulie<>hpNw_pmj{THkZEp7qH=x;miGDH12r?-cvUobT
zlBNbxzb`(5(q52?!2%|9(HjGpn3CAO4`_}^8nTJR7&wA#_5*EfC%XC6{Da>vhV*=6
z_WklGM7xjaLgi-P3_p5(<NMw4K+{xZF_KQNTXoASYCwq*|L<KSeg$vV_69tPi*)6y
za^D}Mo1}zjt#j-EBDstUia?mKsPqj#wI0^EjNb3GCotCI*tqx$r_{KLCgn+gyx=EI
zUFG;U`uzMosA}WYrpmH0T{I-_7IK*;j)a&FM-5bqP|n^6zH}fgo<-h$ql+fp;)+TG
zizI_FoPVc7SzfyR$?@({IUKP3OelW7I%SMv-rC4?wpP+jv>+Jj!nBT2d!><$EvEMA
zLWGi>%BDOMJ7W5&J&SDKne2ImB*FtzcrC)v&*Z+e=eb!cF~0s?9LobmHu3*O{N=uu
z$oM3pHSb*Oe;j|sDWG6!v!LK3G5*>h$veBJhLAvdA-JZI^8*tVY@2`BY$6kcGW^1}
zt+Y2cecJ{+|Ma4ijU9FG0^^ksA}Fw<jXjPR8VhhA)ia>+{X3in1-|H0;3@qRnKGO9
zI9BJ*n0n*$)cD3G@7!N(&+5sD>8_la!7x0QnTem|?&Xu45#+HpJgqoZvp^-(W?vN>
znF^$NVd1@mZGrton(f|GC8L)<5&N=>LQ^r?;o)LhZR5GX5;TXIj7wd!n?Z)3l-6ry
zl(vpmF5hP|413m=7%}RR%%<$#aEDP1&@hIK_%Br8UyjWuWK<sI4M+$(kjy9UsRG&l
z-Br0@u~snnU=Egs@j2U5N)NumWo09@QK-`)<Kp#2F)SQLF&@dkzgMA(QtU5C#q$7T
zm$3oo%Td2blU)-SyVnrntr{wVvYA+$X&BK2;7dX=6vu$>Cj1>)jdaI;41iH#>oqpF
zgq_BNySC@zQFTEI9*kkJ)){PSK>wU25)WSO!wSFmiMvD(JaMx@%$vdi1I9Fpp1BN0
zH^@)>H(*bxX&He%?*otGhraNHTCfy0;e%Bci20vYet%~9#lN0S9Ww{h+9X}P$yHc)
znvkZd<M=cn047`OjnR{XD9|dPZ4^Z2l!=S*8mN05%`bW_m;YS+l^FEfhKyq|4xar~
zQ{i-rh42Sd(r-ALMW)~kgn2wGyAqo{14%nWKr665jMs~t3uX5<2;Ia}-2<H)%qihg
z`y{K|#RI>`dr@&BO=*koi<3(d0oi5kbr068JDwvd;*-0~R**<VJOCpe7HFnm#Hb@(
zQV#M~4sKI>i4(B<>=v6c(M!=z*x#kq13RH6+2=x%I1L(5SolQwf#2PJ<{LBh#DtSy
z_k&||t9ZdR+M7K@FvIU@xRLgq#8Re5K4y5}<1;`g33^7H&aB6K7S+?KyV!*h@h1u+
zUaJ5Qw%>?J(n0`k*YF)xTnP$W0$)I9jhg<f$o0p$89v9p8{`0tVN;B=#5q}gka|`H
z_-obmT$N!{*`MqdTsrwi>iExBhxM^+yYd_4K6d2k1ArDg=Ncp*%cx>?s99$VpQ5q1
z$Mq-cQbSS~{`J9qq$xrbu^PpGQ$%zI00QLUq`!npk6}Mtrm;~Ik}h0>S(Kjjp??tN
zwi-#;TqIi9AKcU)O8ony)b|rY?~Rtp;AShrF~C~r$1qjZCt-^!*MwmJWnk0fBHmOw
zrqS#^cPV@W(1(@Tc;Hi<ci;}dD6n^*`*7RnUgjWX8c!Ugo~n!Gzk^^to2pI@`Yo?Y
zf8Zj9VVp#9eUVtX5YgrNW5+m}m;H)=v>jP);c9nOxK<>JG2pXMun_it315_4-7kF^
zcm%6dzHqT{KefGC)sG<Hq^iLBc#0hYz@lO~fEOM&bJCkn&Jp(nGFY-qV)5XBJnz^%
zIwTKWOC54w^7_z%R~t(B&Fy#^7kNGvi`PZA#)~$8Ye*V-g3iY;!8wocxuAB+@}o-`
z4hd4&g_GnFl)$m>L<a&fzzrx8791y_MJzes`%^sx(ltp1w+off-RvWZN0DFlG(L5M
zc>W@Y+AyBjNPto3g$vR-kVp_1;67w~`21v+qY8Bnc#s!+Spn}GU4L|g^v<Oy#61Jt
zfRAC1BXG2sZ%cSA?3|*Ew)42ZBYSm_jK`8e3}a^A&jHk-hcna)0!`yt<X|6rns{Co
z*X6w@R%~*jrPFh>yW0WYE=9e`Vs&BV`{LNBMgmM6#;_NgRD|%4*}+$Tye7Z51)|03
zL5PsNBh7LWKv-Y(v&$df_n~;$=#(HVfnwRpVgHk@rJVdrOa7CKX_(ztXrIW$YhAD;
zF1_`cB8G31`{I#*)(WuH@v96v|C=#zqjJCq?*<f$M+FXw>)R!pnvuxTpox@Ph2X{w
zVaRPKZ}OgIK5G)C#!@n5u<*-is`&c$@t_}V%z!|}*d3g^4S|bea=7;Uy3-V}*$#w0
ztn$Bvj{_(cM}DdFFyod63@~o}H#CZUqA-4d1WVVY6_N=BY11=Ew-`zqi@(GG&VHgg
zb))4`@+2vyUmo;4^%0K8ify{4kUpF}$O0Jev%7cs*(ZCP0zeGy(o2V@G2g3x0319T
zlF?=`_RDhy$FgcdIlg2C+RfrwXj)A*hQOBYl!_E1p;bEe*;*cBixWeD);;$u6^(Ub
zRqYcNe4;gE!OpI%b92b}?3}EnT#(msl*#|?LyA-)vzeQq+0f$rH_jYAlL8v*8IK^>
zudT5ulc^<_xHU6ZA<6O?s+})@Vt866_#9BZ>|&G72T9MFXM~>h_B`>3#^OOZ4U$0?
zA60MCS@o-!4Sl3UMvF1r{%QHGnOt1-3g4Ne%G-w9iB6}f{h7mXl<}upqRcb--Vm(Y
za%EYc7f}ZF9Y^=`r|HKU2!0VBn+-Mhu&+fu*1`)bWc4F$>BL%z=`Rgmu-{Q(5yk{q
zbjZtGU1`>6VV$`#jwYfgf0p&~e`unB&M)RGMp`{Ww=`&{tgB-|&|DO{AxwO8rttpe
z^GWiTAWxGsB=8xSYAoi}s@#nDPZ(np-h?8WY7_{>T|5+Ih`(KsvB)0!&4v=dX5+AG
zs!cWtLCRO^I1I|_Q5$CLa#)NphTpr6wb3(k9#5_~yV5IsT5Y5Fx{nCS)2#h;N`Isu
z5AeBV`upGifEEs{+ef;EM=_h=7l(hz0(KiiJ2D2pYYOh*lvhF$n5La%nfE*i{9uac
z3f#KYTf~SEx}5wX!_NG|Hwd}ib|Zcj^I0ma<Z=Q9BNy7b?Hm9;!5$d7FVa_H)k5AW
zJZ&&Rv@2xFd)>{Ao*Tmqho*2LDVIvpv}ZQH2M7TnC{W6frmd4B-basc@blsR!Hm^v
z*OAzT;su_6#;o7oYBgOkwa6(;d*>rBM|`Zus{A@YPKuhmI=jdgUrd^_%X?LB97ZlJ
z@4<wqYB?X<egp#Lj>F1Ck*{lwb8mpdBP!Xw!>Yj})c!dSj_v<Q=Y8!Zw9Wj}0ElgZ
z*@WyT83J-ae2(+vApPjCzYsR=V%|FJ6^Vam41GJ-{EeFQQ9b(}Ag5Z4ycX?m)sMP1
zw+F8-eX#E?R8mW&AidET$Z1n5Az%TdqFt)W@8^s6;Be1XN}1F29TBMYBntgX)<+4d
z|2vJ?*Mc8IUn%5Vi0nLUJLhv(ozmMArHK-O{6mZvb>C{7#C6;RvUW{Pym)`^y|PhV
zRD#FSQi^S#XJHS{l>Vwv5z+Kw)*+f)5QAg>M>}$9zi3=0KFlrhFDS$d<lgk7<e#0u
zYP12`4sQy_#hYVH@NRD?8$^QjED>0@Hxp1PP^@L{=&LinD=1{2!5Q4)9425Q5R(p4
zczRkqIvOpO73fvE#`{OWOQ!}$8jI0B?R%Yb;ba0%%PfASD7T51g>|SAX2eh8HbuR;
z7rW7Czzz@UzQP?U|8>JQ<mZl`;TNl_PDjjmtacpg#meU+*vjTS?jK2Ix+MDCkHa!o
zugN0b$%B<(3?<O?G)IlGX&E}HW82##8h(hM&ZZ+cEBGj<D`}3OD#S46H=<GcTGZt2
z-G$O7x7L(@m&P!8)`<_><+wr2)o5R>iShxcbo{JC9FOh{A&LD;mmyKohhvG~@|BnC
z`EK}TQd|cWEfIq4VH+>>FQYlXs`|h*z>ILko{tia?^7*&p_$|J5iE)Qxc^wT1!k%+
zGf1<!vCbOh9Dn<dW*zM;egM}p+P~Jl$Q{dBVSLg|;`q)->=fR!vOz9QmDBNZ6dea@
z=$h*gGdP>U&^|mj2{8}PRyD~dZyGBqhkl8M&@H)37eD=x0;->SUSc&9w5a5m;W5~Z
z(%F2Vd={uoW?Nl@15+S_Nbf+bE#>)IqSxX~JcREjqQ9x_)$6Ty@c~P)N{l5wM=k!V
z)|<_ulq&g4LBXfb<lhHpaKUlZH;tvA!r!j+Er9^L(|;Wj&}TaPn!)jLsWfDkACvLy
zK3&(vm0*8VHeFbi!7RGvt9eD%s0><7ink%~U(?h?b%H<Ze&Eg-v{-$JbqDF23u!f5
z55)<`KES&2_y~%D1Vs!pcWOB$4|Z=1@u!7f-Z3He*`zyH1`3k+M0eh{JN%p0G77cg
zJ*AVlCiR>?ZK}42sRlhV7m~AmFuFq0)EpUqxn1W|h$+4aeJ&@;zvDn$cO)U9W%k$w
zyLTL9pA<r4(?tL+ZhAkOrUh7pnBqzhtO(qi5|I2Mb!I5)FA4x2Jbfa7@E2rNHPh|7
z-5N~6nVX<*51th|ENoqm)<?+`%wp{!5GmY)?blAg*o}!s^fvGEa#O$=kKhB4Bs#6^
zL^iDwwOpolqhFnG><fuy%Ht*uev5ec=udSENXRYrqQ>dnS2%=61#fwleMB^VE9V-|
z%r@RcjQVwi=H<4~JbV28DbPY8$-4QT)8#iTVqf%?G@hfdQ~J$(BT|~Xe0K<x^LAt~
zjRV)cZtMiv&+M4Qr!<Fl?=+RfJL|*#?Wpze=bi8O&vu5E0+r#cAqC-=l&I?baGZK=
zyN(-cKK@D=Bw)0Ib{aTJ-)R?CKqT_F)_6#3wVh!`s46enRFO<jF5gxgC_y>VH2FCQ
zEnNn4L8#At=Q6VxxhxlkR)=92|J6`AR!gASCDF`fc(Q`_jeF-494r_EEJpRqD;CmD
z5z*f|a=s>e*x=vp^+_|LD6_dC8_FE8ZS<j>Udu5~#ol5bSlzYC^JiCE+26t&)cPr_
zZoi!ZR04Ea)Rj*2_PFx^C68kDL7Fiow20P)`)GG3eOula85ZOWN*Mh1*ug8)`raF%
zi9fJ#TcXD!!Vb8JpHGVGXHio%y~^EB$N6{%i^cGqXeMz#`=-UpV!z=Rs1l+;xW`f<
z`0cYsx&X0cJkq#IPjcn&t`_Edj!`z3XS(*mtomW}53BL?-F_u3x4huGd5%0A?Kgl}
zfN3nt$6lGdb6f5e$St9*RM}J?vlW;*lM>J5kVCiwU+Qs|J=L2SrUX>dvfzm{s9hQ5
zX^)Q83_V;nw3TR;OW&aN&X*5<3k%9O-s=C*HG!cSk!I9Xv*%_2wi#l03Q(?ub7)W1
zI4Ou=l`fma%r)S*c`t%-eA<$vDGl-SF`Co)#F8s3kNLXdisqTjozDYZnh#ElHAL2m
zx^H9Lsnm#SAOyR@Ob^J%oZqV9*A8x2{XBAM{2J7?CCqxRZyr&5@H-x7>l1#wo24$9
zQH=IdhZXhXgbE4RqmhF*<h96Dz2AejXc6d{0-c>H!wkYbgZH4CYA$<RzeZPjERg@W
zGMP{FjrngOE8{1b+J-9wpuPE+Z}^JY<@7`Gd^}{|q@8d>bKRyCDGHF!qlF$qtk1Q1
z4;{^XP*oZlCm-GPzYas$pZPF73^ym$^1Tv`%dm&2@jM!+CJt)C+l#m2ZF^5j-l8_V
z==vjYg!)HSfef)=1S7BKzDcs;4C$*<&JFaDNyi)!503Ce^9<KEeb2m1IS*b!*{_lp
zW#rtuQEA;G+HrBHw6d3vFM^%H9Iea!Qmo`jBSi13j#d74ez?~_zsoA&HY?5HCgb(M
zF*P#G^qd8Wv*Od%R)z-*s5jW^E1Plhx_`_s<bU(Ql<FYsaFdb5ya}G*3?bEkn`}B-
zNZ#hSkn3UqvD}lxVtq_+%%4iFBDdjCzBuYEi8zya$lmhzz@le)2{soWl~r0COMD!W
zezGg1=Z_g>eQ`4cGJ=1uC>tn~(Z=M%Y@UkKMZO{k9+nLNuRP3nA6<{uwB7ST`e{=>
zA(8H}XL9|cY00TQ@uQugh9pX@cXxcq*_bvwq}(n;D`DTzr7FDVxJ(yvm7vH4HYrQq
zm%T8o$lQH~iFZ_Z=I)iUTyIijza0WMG;oVoV@r&s!!CP;)5x_S&iR}Wk;bS_`+hf4
zSCjx?0YvS6<bunVZl13=Nj&=s%a7B~s{j|0G{$jlntztDst8As-{Frc+y8~%{t~#h
zGGvatn&ydIXCsIBZS9|?uIuGYe|bFx(d*O`=I9cqX(z!EnM)^)3bkq@QTLVQ{->$y
zI&%EnLgfhGJ&+nm<?%<I{^Q+QUzZ;Vv_^3Ml}F7-&-vF&e(pQy7PYD1l=$ZeM$8YX
zhJ2Af39iF!3JW%Tbjco@KW&V5;tY5LNmt7rLi*hvJll$u?UayfbJN~7&s%gJj3c(=
zU47$nC^~6FcwcT{?wYGlCn5DRsdnt-=9EI^Cz_^}T2%634vQhz)r4k$9mugbF<-$f
zgNv(550e<`uI!~;-R!_+RZ_`QQKd3TSh1D#+}dZin>-LwWl86$s+UjmUhBh9F}ZC0
z^Bi%+YVUIv;AEle+)H*NJ$S(Yc{=0OX^%6>xFXA(Su1a-lW)9UjCTB2TFXBCpPRdg
zaJ~HEgA0coc`oi`nqEGzt=dPDPV4Q2<pF#rr7sUgym%)hx|OXwW+NamKEhfubA$O!
zNqphkjB|#fD<HLs>9~SCn4+oxJk*xVV6dc1Gijq_re4t0^Vt%IRfhIvO<v`=#^Jhj
z#4WGIgj~NN(cSt^>EAx$CdIxg!lWn~RzoikAL=W8<Vj4_?5wTa%+)W7D~<^!InDZl
z+15-=o^C|^IQum?$8BQXCYA4j<H@tF&~U)#HE_jC-HNUK9}-z7soid}t2mZfQ-g{d
z%WV-OQbs&2hY?(%pemuOcq=zk+<;|+flAx{zT?|!ExqThYA{o4eVB`M0IhS2?Z9np
znusl<JnuMM13cVpQbDLGirdS%1<xlo$T}JgJQl?j@}qN^&daB7c{%i}^!LkRpS)87
zby+dEkIpTDeW6>WVI}`sY!8)Y2A7+r^FjZ&oyfqNY#x(0FX7oC(hSi>iN40J4WqY}
zlW{pR2^Y1V?Qv}bgW)nCrV#BOL_Eqi_RYgBAE0jgjei-UaL1&z%|5oF!spCe-g6B`
zENMS_Y>3|Q=}>ET7JE~#I}K4i{Anp04vT*-EroKiu>Zh?<4U~xk^fX@*@<-}N9kIX
zl{B<6={I$(@%D6feNE~<t08WENoltA(XSMB9&cBuRnUfqieYnN>icKup<h^vl2G10
zCif?`D+C29>Y7}Oc^UV09~#$Pjkg#Hh<Q-?6^ekzNYbXgh?g#@wXp<7x_g>m?48!r
zR^3=PQk53_+GL+UNIwjsJ5)=(_*e>`WUQ{Z0qd|hPPWR-c?Nu9PJdU7A<Lb}*1Z2w
z_HKfM5b;R=d+LQaj8OVJ-`De;uPHjQBiEavrOyPMZ@yNv%VyR48fyHs>+F5X+$nj!
ztCEw=KUqE_lmBgLKV0bwLmEZc@woRhjkU-j?^lb{U0NUXhU1Hi8km+1+sk%1sB!b4
z*L{B@t78T?gE`2TTho&U<8rEo79V|vic;HCrKv8<wPmf{j$KSwn79K4lz3vyrTzw=
zX&n{Ck}Z9t4<3t^H}t-qu~dp~hc7Oh^ohTleNgPO{5_E{cHp*pRK;qw4nL1HE{x#w
zw3n^0CEj9N)SmF^XJyDUZ=Qe&spY<JUW8#})<hw7QapS8f#Ov(N9N4OYQO*SHsUxa
z0AI=^5JQGMD9%29R=PZU#@u;H@*d}6GgQhzNT~)GC^VgOA1FWDq{ek`shUuOtF1wJ
zM;;ZJJa4*5mKzmS#nPhL_&Z@&oW~K*S`HVd_Rrnc3mIg-8pc(FR-Uo9wDeRqZc02u
zq~ND6Ul`zzh${7!G8i=FH#gUYS*0C+RpG#MqIZvEg#jdpq5vJ0a9)}W;=xeVSTD!%
zQ|$Ve%Ec%F-(@RzvR!oD*GL#;ljDBcdy{4Z@a(0ZaL!;6wFeG#g?EdvX$0POYi+j1
zV`$t#ta_<?2>rfrB&s29(_JdG%xx06!ls=N>ke)dfxJZ!^H!+@G$8q#$fK~4Ho%vD
z(@7-KT9==5GBVF<=AViqNgiKnsIV@r@ybu0yVO-zuxxJJ>Sks^V?Q)ULr*K~@qDwU
zlNThG$eK_!ux*g`VMz_{oT_YDn&>i@&4)0#N9JBRQHFd9uZKNpQ-}T<7x?SWDL3&i
zW#m+dNQUPH3XG6a^)y~Nlp!NTnysB0xZc#uI%|haL5(E@hBI?c|BgVpd$L%r7IF+Q
zy#1I+L=HpHwuYxpEu9Ry^%A2U2V2B{ElLwB!~K}GVk7QyS}GTrhpf(A#I7P3n{R&I
zh6Jzgi)}1acl*s|Nf<Z%xOaGl>YnIMsj3iFfjt3Xh!8^DzVlvy#d-}&j}N#xHDd3t
z3E|+LL6sQqRbHNJ{uF=G@ri0a*y%iw?)JQ0Qf+W3iKgStOQOcVO>_2NV98>enxXo9
zuXN!LL^50zCAfVN8!+iNqNz67%$ceupsE4ee14}a2yPmvGXf2oH?QaC*_ioVagKQh
zdoy0}bpGMX=9#uexC{0SK5E3#%Ec1+&K2(qb2_McBys;AJr;3T);#&l<ScGpqz$IH
zyt$#h#S442A8KW?i@0~o_3>jaCyc_u4(3}vDAO=E2wNZ4>OkISuzre|{{a-eo+T&N
zF|=}P{~<HX=#)lPib=oPy6ul0f*Sh9hih)mFaaV|N7i|db>hrT>^8p{Z1#6{q<;QE
z2v`4Zd(HRO9i2W`;s+Xrk*y}>l44jfA@ixC^sC&LH!iCqF336Y-p4uPO|RVXmhe?^
z#lcX};uugXTwx+8MsTX42f7G*NDw1~k{0Q;De&ri_$w@aKZ%tyXe2157j&ei5wq6)
z)d=&gS$&Ez^37dsRMa1+CRzp1hpl&E2up-fW*tu`;G|z$HkqZ*u{ETB`?iY+fjq4?
zl5}%LVd;oQ@x;3>>}V~(O+U6FUkQdOZm3xuzM>2>SQ=$i$qMNoW`ppc484YR1XbWL
zI&6}w(gqeUWs=iO7qlkIXT5)jS>-x5jI?&qg}k0Aq+^wi51qy=5Uh~@HGgt6X_C1d
zg4;9r0Yet%M>?pOon*f0wwELdUd5Lqd5$7-a!}guL>9HbMr@4@eY>-Gm8K1=JIj)~
zvs$uu;&3&f`l>E7sEr?e##G8^R%gKvbLq=)qrV1gZ5}#ZP1ihqG&>2=*&9WaVCfGK
z5KI*jliMpAxS?(1?byHF<$VgccyPgbF?OpBFRVn;XSh4DVZPqS*4=;0M*oP)H*0Ma
zgG(5%{iXV<KXyA@Wbs8eE@i3W9~uw)^6&!Wk)l3_<Hlb=$YhGHR8QkRw6E7W_py)O
zL3n02M$fk!qFpHtr|b7t^G}FrnsgB;x-g%sw2IDZVbr$*xq#qgf=!{`+$qEy5_QP-
zMoWUGlj<85KR-lBUp5Bs1S6&-n+QYz(_koAJ_2ohWvs4zV18*OHWU8zd^amX+|YBQ
zsY;LxIDZqXyR=h;uGOzF{g|pDJuNBS8w7I^zM$B~<WX7S6IT4t57_F`Upm-0h<*)+
z29@7el04J`VaTNWzFc9krnC0*8k#O1_nilN?_CR5ML2P>Q6_voHQtQs@N^n|HrBFe
z^ucUQdYJ*wYyHaR$+x4acaKl8=Jnk6biXy9o!av{E8l!m!?OBx+j%Z6+stdc3#MDT
zuvxON>zNZ-ZZA%JJD56p0q8Bcp$~Y6e$k;h{$A9Vfi`LohbwHF2~nuCN1XDH5z7n3
zVtZjhc7<2ZrQq&U&TnF^J@QW47SE5+%ZOw8mp+HdHg4<OB<vXF(_M&r-z*vWQZ=iH
z6TDCKoKr%2jk?&GJGp#AF`x$$$hwV-bg8`6!Pzt`5F5rorK84vZV1P5v}^Npe`Azv
zgwVc@`s79ToEKQ4o2AF&NDT?Y7{$o{LuBI`t<a#1sDj3p{)pE5bs|~d>Yss<2<No^
z?Y4b7$L3=s1$W*v8MlEAIm9`GOWKXys)Y2VpNOG~rK&GUP6OTdeLMZ{H}DTy`Q+Jc
zHAr``fHG#d!O+)G=xucw;0PI%1q{XbMRt;`BI?cfKYpt7am0qh&QBfhR#7jSm1yUr
zjS<1?Uh-HDDcqd9d6n=k`)r!@f+dXd8_=IG{EdE&e0RmJj8%lJjbJ3zH~e*uF51}l
z^FJ6i0oXZg2I0*gD4{(LoM?-r4p;oPVNoH+j?c%gWI_e=M{Q?{|3ev>g?K>Y)1l32
z3!ptfJaUoS<4Ul&Fj)QR6CTTDRa`%+)k+FS*05p9%4xP=!3?){jD~xl&WR2VH4K#+
znKgD5f8R~qL|^TAgY8{m@0g#>pVC97PKVQHI2$;P;54E$vyIKbdc#fO>P(e|tB$<=
zs8s^R`hAFp;Srg<bar0nn(NVGhJR&ltR<ppoCrejRDtPtDIn_qfvXW<d$+?o92^ji
zE;fFOU1_yPlL~A{>&+_W9!}$O_Ow-#To7>tr?&oiXnWc=e1TxTxK>-c6{lCp^(fAM
zUGe3`BsPt)m~fi4fxAAwvM*gUNl|fyJz!BxaBxE#hQ5s$l6260q7=dN16w7iKd-xZ
z(MN6InEliB!@@nm#SzSI%29ptEn?BNJYekWqIOOMBjpaKHYSm;9p3V+R|^Hlj^n-^
zVijE?QU*4-VnKYa8J^<9-}sfW#0Oc}xK6ZP&htvzK4>+14I9V$o;}6bG*bC=;p1`t
z;th<|vExDJ!*avmjw2Qcp~DIO-5b8$Pz($X#dUVX>jF%)dAwCYLip~#;n%TIB<)~V
z?3tavnvdYzi&!?YeJE5Ol%;SfGBh~Ejd4E}Q`|ztG!z%Rs?JmcI!=w-AC^#czw#&;
zm|X~O#f{pdY<~hQhtUhvCxB_&&k2>Z`d$(Fomcm_Q&!i2qCM{sb?w*%;y&+~;;pcN
zL1J%$+o;m(mT4{t5`%!p8u=?;#c!`06F%W~8*@th2>!(&r0fGnEv6A#kUsVViW;g=
z5UPI>q2~<7RCfuv#sj`h!+YFYcM|s>!Q$&eNDpD&@JsCAU#$xfz<iUc+wbOxIKYaz
z=U_4qG#C$ySTP(yXxc?m1OJ8J)ws877Ck_0U24tS#7PzU6yc!4lI@j;l-5P3McnS!
zz^$}mv`JpJJ-P|yxr3C^f6X+{y1aj{v_y#_$JPgArXBE|-$A~3_l>27^=TrC84;e;
z>94fBHWG^e!*D*<#|IxxyBq*VWR*JddED_|UIF!DEk`j~rxEc9k4~JI_K9cyLI7m_
zX9NY_q_ynzsI}p3md}|JALp03UY8kL$k~J|BiD5=a86n7t<yjgEJX)M)m7im6LHJ0
zKI|+d{I2egGkr_?I@j(};4Vz$<tmK`qlyr!J};95WuzDTCI0n~eu3z|LFac2W+!N)
z!Po`3c?Hf{(9Cwl{vt~bg}sdNZ4_+pOUn}VJ|X)wU<eq3zUwRjW;@4eQ+0&|T5o|8
zI$2DPcP`KK8X*(8>0#LBQ?xm5{@RB84Al7OS-oEwa0iJSlz%__D0UWZ6^5>g`XN8F
zkt9hHlr2yh3;d@oh%^H}Mx|i%yKQdIAkB()uLEZfs=4yl&%@VK9m~`I<5t#CDo{U_
z)xV<_Ju!gVxtRU))z`Z>QRr%@Ur^{qcs%A13(~BkRjzK}fNLM*8{}+_Ukuai+GR0v
zTmLPp;`6*JxGwco^e{Yqq5t;2<fh`sZlMiqPt&xR7+qy7I6sTMGZsx~!dny)fmxj4
zeEa75hV3HrH$?Ti=ijpgxyJ=v;h9<4UHV=o^fiWWIL>z>yqQ^d(1VG@466dThK)D_
z;WG#pDrit<VN=X--?xNs17hpj=RS$Yonz!bv0~U>7f54Zt_@U^Np2-fS>VC#Hp-js
z!|k@xf32woC`&e097GZGEz69r$prFC=Q<3%E$jLI*7*&2!10OMk_rhb`nrcA2=g)v
zOE!rds4V$;QVp}cT_@-;>c_ZU8Uup{4NGG0c?*8+|M`-VZu$qhji~My`9plzIZglL
zTK869Q|_~R#FAt}H})1lFtJ}gY1`1@$@Xq&I;tN}Q(IySq7yTo%_#to%%4UD@!V1A
zj5kbQyxfn!G@CpROu4<#{by`*S1J0WSsS5bA4|AKpwJ)vE#&sp9zD$rqRw*;56FI&
zsSoews9k<YUH;o6?n~5k;D!8l7)&5;Dpo~<f*c!yabX?Vzs-4!y+ZbR9D-cE@E#{k
zYvD%&B5yry*lztng-$l<!9813)a%q{NY}B4+rzJPu;7JoAZOK64P6PNXQ1PFruzw*
zjc5CuMD8HzK@NWo&~BGq>Q#-D7N<XpLT=+<88_AahZnm|`((<P6qtaF0udtLbG`i5
zgGm9;);Xh2=d}BWQ<<@LwdXw+YGHdI@gn1O@(&mJve)#eY1-yUYvB|n{8M}70yJF|
zS|&MOlDmqHOpMdcg$2hQzwO%~`%q+&aRHGf7D}<)RK&BzsS1zvrS;4~MrKnm{}J9k
zL;_sp*W`?6uAa;<;rJI^0^N_vmpz&WX&T>5J8;7%{UZax>vN2p=-2p|Fw@OK$9!(w
z4v_3R<`T>hsDh!l35#Puv#@^50rSakgl-PtBKBz|W1qY_7iX)oRcz8YQB}POfjH&Y
zPn;6IC~CY9t3zXFHm+GPiGLf4yBmLv_tACi-4u$^WL=YDxjfRvIC9JUw@g?=2u1X!
zj6YtNq4Ebw7Mw*?EAcm+5<I^)U3(xMzI2y?aw^Z0O_jJlDB!<jR8<#w5kUj);^iiV
zW7-nWMS)68AyG*z;Vo{LzH&D1;p`HxGn{o3?}h1C;2Z2%ReQ@lEnw(1gj|M78P6Vv
zlS&!$80)(!7NvZ+)#nTN0%N-%4GSuzB*8xFIiLlr?!f7<y#G0OSXl#(vC*fyLq(b-
zeJQTPmH6MN=%e=UZrBOf8%IXElgq@eLYUCZBGxB24qj6tv86d^e)i5dcwuw{cN)A1
zy98fx^1q%HC3VdB8~Fim9h;|;E1Mq&Vh4KoKAaUe@|B}^k{5R{#YW@14epZh(yp`~
zWb`6ms01br$7(b-OQiMcC(+#)9OweHKQrUQV2KZZRP0;N2>DMJ*>-_MBYyw6b?|aQ
zb9Hl6X6T!xs@oljJN425Ut(TUm`wZRXcu?#8Qd!IKIDPII3+IR1Jqab=s)B?atYx9
zI))iaHMV|YMob6mvaeLW+z;gm10y}=X%=hmDZ^G*_ee|eiwJyiu>RU!<EImHqe$bl
z^o<}G6&9C~+Ld=CbmO$fT_?o|5S3trKf&FZG~;JI?>TM@@2aC?KNX_^*scXlm+nDH
zoD6btsq1iW$v)&aSPA75CkJtRQ{;1!xOC%KPo>{m4b@#*51M_r`|_a3<UxK{S478n
zYH|5duff^53qX`6p5~_CFVgK+#>s_~`hNhyKt8`ssl+s{^I*o9nsBLsPhC7TNZ8MR
z0C$P}Ycsws3u25NbPXDXz<J;yVPlQdffUI%2xynE^(gn50e;MbK3jRrwQ!#ysEl(f
z7P@$72)G9P2;2qEv&+T|K*Xk#0p<nfRR;uwE&Um|#;dM!by0%?^a+@xRqXDRoCa<J
z4}b?GK~Z|X-3y=Qp*#f~cApX87Vr!36WdfvB$W$u%`JDoKd(BtOzrcK^sC6cYsU7&
zQ~c7`y?*Ena3A;uxJU1$OzCe?hR+FT`T^&^4*Uu{B<yU+ZUxsWah7hW^J?BJ|Csi?
zN9#32=5>iA>slQKD!{8Q9pv%;?$h*+2n3U>GoO|3je(Dc@&7%de~n3<-DR=PGOWtF
z&R+n|0r!D_1Gj1Y0zT=ou<je3P`vE&;e6l>a1ZzyxWV*+Q$ZOWh;!mr`@Q;S1h_%{
zbrqNbO19CDiNJG`8OV*wl!%%BBXAoy$K*@Kq0<(_VrSJ}^x81TKmP*m37VIUU1bER
zv9MsVi~i%}|A)Y3dM{;aOJ~L=`TLm-iiPg>X9ya8lTbo^GBxQ@5=zo{@Iv!RlfWI|
z-@uQ=aw~}T^7t6YPYtTH_NNcH4EzNAO6he{63fE7L4tB51YPcQ>rVl9fqwyafN_rf
z7dflqa}e8;H8gMZ{HK5$z|X)>q+T(<G^fOxqDzcA#nS24bprSk@H23WaB?=R>B!d4
zzKtS%(gi*L55PZwhXj7Kg#|WuHN>vD9U_i5_4cm={{a30Tqf{h$^kkXejlMqc3J&p
zJ^w}GPG4pLrC+WvB{Xx1iOH6NcAo;S0=I#iz&U1rE7GXY1SWuSph9I>Q~Hk*w)IB}
zaFxD~jr<M?E=-Gqx9v|AxCs0N{ENubF8LM7<_5D~;8alO{ilI@G~E|~0ol-5q9z(-
z%cnd1`!4!B{&@&oAn3MJeE};mBV<aC_p+wvzXbdO{0dwrwrqi*4(={(7eU69`bX2=
zGt{0pX(9rS?*=jfTcJ<bWa~&yd;Al)2Tbwi`USST#DCA&wY{(BF9BzOAAlc$>kR(}
zWX+r>AdC^|KdbzI3;2<+)k8fSmn&kDUEkL$4I(Cho4~KYFNAHb(t*pkEuq4`L7!n=
z=lm{wy$}2X+$8p*>rblE2DPMp*7;T+^!#^#pNO11${&fjuB*(wQy!JaOZ6cpfa}0d
zz(e4YBp(6Pxv*q_|Jd+LhyVM?KmP)*0V6D57iCGXmGokkg_qj~U<kMbJOJ(kr^!9O
zbRooC#xW^#C+l{B72q830Qd>GLLbMMc{A_=Cg?t_bGu@q3iv*7i?-LEw}JD7x94UC
zzHrAQ1_*k7AGjd+UXZ231!mZ8(i?FRY5Vri0{0lZRQRONsXa%FK46-6xX+qS-Ohg-
z_?5nYm0n9hwzk^>nv^Md_Z1kR`*w%i3sTOeW?k>#7;pht<95=v0xJ;o`!;Zi+?P@N
zgj}WrX^Pk;=yu%{z!l&&a19tI#}#=H>*BE9vpN?J?l|r{OBmoGS=17U4GjtQd6{!R
z9sVgWd3clad42SHOpTf=Wo`d!T9bFd`%S{$mdP_k*+XBVGItq;9ah%bpxHMFT0X?E
zIidFOs76l#*U0=XBiT9sP2dN@1{XPLo607b5<#Zj&V?^p6Lt#Bh2LZ1bd-da?kuNS
zk8#QDJtw~xfqVS@l3W`^le+RwNYnY*3_wWw|3L32kQSy?b_n;8)7E6WZijzv5zKpl
zo;vu3W=1AR!q2=y3{igX0Mm5+Jn6!*Jir)miRrYjW&lgT6mT85MgA$s;ce<R1V(B7
zT}du?eoMYh(Ml#9l=Gw)&H08dqv*H*gTN(b9#7}j9R<?MRtCm^s}!C!GXTzd-U05A
ze@cQbVN&dqBVvxtbok%tPZzt43-q9MVYz=904}g9;gru_0AL1q0Q?MGkq8^&dqoVS
zON^hlIJ6PDYq<;j3fvIi-#V-#>QK|f9tS7aTqe8>Tm&8h_knZLJOo+4?@1hzaARtl
zfzHtPyD7e(&msLx{Ag<obScesr`X1TJHW5NHDFM7RojF?Bs+wPva|Gy6Tk{^jqb-q
z19D{dStv4SH`(OqPv`tD_8F4pE?LjYE|34S2CM-a?)=Vu`x&?mOflwzP}};+Qhr5)
zHnaxjKkfl{fU|V`i;Muhybi4}HM7NyyEqNp10K+N6?kC%S=%Dvq+*{NvLmOxuLJiO
zI}MAuRN|$S!IZP#?V&jrf8POq;B-1<>8&p|)TAP1VtB(zk8|HHNroVkJ&Qg%pOm9Z
zUyc5wr2h@dPsp&_;?0y+wZ!-TVjIsZz!l(E-~n)&_veWYHb2-%*aFt618^Gnk&*vE
zeo;@D-#ZiVXePfgfMMVV;8)-WBIj!NW+F?MOdmCA1NM!7egy8&fL8dPwy?s~aLRzt
zRfV<zoCAIa{s~;A?;DWein1=NC5tSa{%F(RC2xM@{r5@W8u7wfT!vNT%E7Nj{}RLZ
z`%K;y@~c8iRB9pRBIfAYCV(m60b{pO84n2se8v1}Pd3Eo^p8v4o{~*}L4H$XN;f*B
z{Iun_3S49I+>&RiyrcXM=#T^DUw|Dy1cr!tTb6&U1KPH4#PmsrS!Q>O3@`wUv*w?0
zMNX5eH(~}{DZ`;5rsPM=10UnEYnu#e3an8k%Nwh}C`XaUfza9YvrNuaIHeKr2u%ST
zaRW5tJiqhx+tab9X!y<Iq%<^pv;jpn+0Aef1DpprE~CZf0V@xB!UGldi46lm0hj<T
z0B3;#ku|eh`acUCrMy5#3*cz>8G&7GO7C$f#xP|rY?L7507){*{BTJGI3Ufe`cWmw
z_gNP!6o7Hy5^#YeEQq>p$PO5fPRs~<bo!0{Q>^dw@d`42MXL*;0~lndSDSw-Y@V24
z?A4N`y{-JZ=@1y;`Nx1#089WES)VCL<G-mPUn1uMFv@&v#rS`U{1?csrj}ejXq6qa
zpW)wc$5m6(HfPrsN|RnnUUp5Qu2Zee|05jjF3{E@zi}y*-SZV-jDNp<{$W;qM<i))
z8eg?HfTH~UA>*G3=GVb{N~M643Q;{yGrLjPz~vP#1Ls*~;)qfQZy*!28Ta?kG0ocM
zhiR4wN>s5e0sQe~@3(6*o@AY>ef}vHj{C)FNrY|93>5PG6Z+JDZnWQZPgnVC)&`%-
zMHVPdZTdTTI!)_mQH#&au>}O6{7x$9d>{FLtOx8QrU<s^Jyi4j=Z8gZXe{JWEj-y5
zW{!gkIGhZO8R+u>Fv9ABm3$-a-ipbU9&3svV4OAj07xp_)oQ-8n``s=PeuH&f~+$b
z21;eb!`!M6isRFu^{_k0XiclYFs;RAz{dMN0WY=!2IvhJ_;wzb>J1xcc$t^(Nz&Sr
z5hPD?K|cA>l>|y9)P`vd`gmGT3Op8#`8>;@<kQ=S8K?9n)SG6;z(0wErK`;I2S8FL
z3shOI2qf6>#Dnn|T$uqYl@JZfdJkWejxX+q`7y4l)32c61AL#ZyniZUXv(BOgF8WA
zE$U<U>nWzlP2$XnGc)ROscgGZzl!DrA0FVHC9<p6r&W!ZOX~;5*dVj%(<Tnk_xHtR
zheSO_l#(nq4uGV}-4cBwb_GemoAog#ew0mq<udk^|GP3jDbaF`wL=anw5!wsnR!6q
z_PqR_P+xMn0iPeXL6r*Mr%&c$3)C6*yd@>8QerL_u|Ft_-pgcr^V)rh-OR0?vL@?*
z@-RUal*q4zq<?{5n(zwxEmQ4u;v{vlT~#71WM;sL<}jN~%LVLJI2UZLGlT_HkUfg<
z0QqMlB;tLAJP$G8<~Q2aq@05uHvXvqqx7Ci`6aihgaVdP?FU9|&?YQU0anD*F?;PZ
zqY`i;!=13iNT==8SESyPiTe<VCW-hC0o&W$Q%T4pRHg5#E_+uqS-I^|2lN|gwWh1x
zrL`pFD2ZXxT!t$SwuxKhKP9<k%R8GlqEcP3bWkY|`fhSlpb-XrCu3%h&j-rZ1%{Ui
zzs2u){~`yu+lx3t0ih4`OqUAeULyAbF7|aVn`myw7agAEeLqNO+1Vu#vR*hq0SqMV
z6N?iFfv^baBDKq`=7Yps-*2kVlg0Cy(6UH79hGHrD#+tAX&Tl%tAZ3%Lu00r_<pH;
zi$wtDO_u1ThIalUFRv<H6Pc{b_T^9PE84qIN~}(r2!Y2oG4|zqEwxhyf}$)oF!$GP
zF5xESl2kwoa^Ib71M;fVvZW?YElI`)l)4azOPt)S3fN{%D%g;_G-h*04t>#eAlt@F
zzGjR@Lx%N^$2<J5`pct<HmAkh0M+VESqkg}X-O7SXtu1&lA8Av$YSh*@lQcCu3Dcc
ze}83a|BeGx6wvlaPM-2ov>0Y_Sd}U)F=cECGsqR1Rk5Cg0=r}?Kq2(_ZT&(+yX#Pd
zKFx<lZB{V{6?p;#Jto1lr)8AdU!mz*WfV&WdU+2dQ!gc>v(O;kV<sq=);ZAPDLgrj
zZ)EZ0iZb$GmIu#BWfO2+kalygmoQE*$Pk340F)3N$xzWVE2#-U+h*bCFS+x#EpYMV
z2b)4sE2|n&JI{yIVdjC`xio*}343=xZvy4znL=IBh;E(Pvs}Pilu+H57gjDZdlXx4
z)02b6c~-Iag8ZiT#dww?j%b-YGF*dLKx$Y`C^}b=(QZM;@2%G0i+Z!u<%R$<Vd+rJ
zg(rS!$b^B0D5=rk>RW-xFXUZXWDpB6+jd^5Y6{59djh`vhA*_Vo9-l8$~<0;$2P^5
z$@3kT#(c$?G>>R*8scb~pQNd!&KufGg3o9o(c~%-ezRvguaV{RXGOkbB$xtSo?tsc
zF+ikT+ghR(*jN;)Pam$N^iT6D(eA>RpE75_g44DX`*|ysiLUqg7LaTA*%4L({LsXu
zAlciLf6KVsluSh6x2-PF_9jWB3>@WkTP8)!9a|rj$foa1U`sMGmWAN%o-8HY1d<j9
zW%s3A(IUrrc>|VEnbvIgS1f*M#ygB9c<NHpEiT{N;x_zFe%eaj0llsg-`NwUHHM9W
zB&I=$MLz{{#hF-43#aP@{qCeYt!dHzc9S$KkZ+YM8@A%T>xL{>Yy9j<K{;h}^qy;x
zj@i>^>N*ug57I)@A$ospp|1(<l~pY>63?9QjK3fA{vvQxZXak!1<mS4>id~1)QNR%
z%-+$nEH`f!-!wre8O4?n&vtP|a=mtQzC32RoMzYg>w@`H;t5lszwrWO_go!_BR14F
z^SWBo$o3T(<M<j$-boL}c`nFA%lj5DQv{lV6W=NKnHFhC&MeC_XvoC0<DWR@`5g%Y
z%7RDJTM!Tmu!Utq^^eiN#msGzhn+HumC=%xC`c(t68C&1ukb)FsM9^Ln8=+s*7^$a
zZXEbzBtA$?(liY2tI7YNhAI*tIwnPm6v`UuAA|a=&a6pM96(=Gn8`&fh0!={$&wfI
zSIp!>{I!ha-QnKhUH1aJz#8|2sIfL1;7bnElhq>o?gE;emEGmoONFFMMQ+T4w&eob
zK(om`Cp%<g^MH!N2BuOYR(5xs0!`QxwFR`;^&8RsH)lmFV|5Of@4A<=RYQBC@k8nU
zgs68@ta}b%Szo5b213YV8n(Jq>}d-|B%s$#&gq%CF`8>Y)#3*8np4s>5?`k-GT}r&
zFAwhhzXF>I8omRJi;3v!zY4^GaP-=6UZVBny7LbSmqsFVo)9nQa)vhl>~Wj#CHBy{
zn=4B8S}VILJ4ttdR+HPjw}COCyaqI;Is)?h7huCcx9e2E8k*WsA%dM!`|WTqO9BBs
zudxzk**fcl)=yKeyy1Ep-DS8kN-S=@Cyt#XN1bid>yCd?&Pna%^OulBq$emgT`ZU5
z|GG@iH-&8@nP_rT%^Sd1JO4JnY>fQx3+SCMuamNebJh4iW>57RaGK*c_OXmkCdSO?
zTf5V>CmOy2`_h7#%9EA|wXkr@aMqiY-(6rtLaUUwoiO2)$SZ6RWWKF`wtyX=$tEA4
zuA3)-Rh|rSY>3Kh%a7y7zI+^O=QsIna&Pu0U=wKiO|D7!EQr@u!Z!0`2E8eu?d?H~
zE#g>z%`Z6c<X=3|o}27J+;jiA0j$ZMnp)48yeH||Cih}20UJPMJ#DTnY?sEmAn<jd
z0EBslsoVKK5e6iZ8~5{+G#Pdz`)tF+b?ymS1NNx2v;3QzR!`Z(x#HT`5&?bQq`%Oj
zPu0qoZzDFqeFDDd`4@qClGI`!^V-cM`ee8-+J{`@F8Lj*NUtX>FrT8uBRKlU5z*Y1
z$b(Mlh51xfC{T8rTs+^t|6MK#+Lkyf5#KH0D>MbsWwi_R{3EbNb5)RZm^DS2m%GQk
zPOCs8=J$Oe;<~jyR>Yo~J!k?ez?_0^I|aBcahwJ#=3H9fvq^q!pMQ^QGFHj_8q3>t
zwCH$l11rEHu<jz5ufRO80DO^!$kw}2Vrs_Ls%!$wly`s*-3qYFx|!w@y?n@2r~IuL
z|JS%@XI=2W`*_<HO<L@+L1Etjl7#<znF=kC`i{MYlj0T|XqT)965P>X_g`dvBa|mo
zQQ+%*{|^7Gkp7K)&dzLrc^?hR>m0BKG$ZclSfmfr62H}xq^2d3w+XzmYxG|xkc8!#
ztfZ4ju$~gWU<FvVr9ykuhudtE2*qPfMR_}K|0D30=a1VavQO}hilT0{eYeT>=eNM$
zz&m4rNJ_zF8Zl#9t0seZQt!ExWXWlZNQ#0bsij*eVJ$Q+x3~m6C2Vffy=9eEacdPS
zOH+oabl3j+nxOG(cK)J(MhB8A2*nAE=)2AH{Oyf2F$?oU(#uegY>U;T9k+c1o&bLl
zVG0S)KOzV<v84NqpPcD=4Ls8G$4s7B9oMq&mP-j(V%;F{-`@&)?+Xi9C!m$mLK&fQ
z+Vh#xKVcfw%&jjRZ!x7uRq+D&19%Cn8~w{%0$GvltuMs4w80vIBcA|^PTM&L%bJn3
zjt=P{cZ8?|%LLwgV?kh<bvc{CE=WVS)#Tjt<tyNkViqUT7}F;nWQ%U51>erp&H_(>
zXTT?;zsKH~VM*G8Z{g9KcG`CVc*2c3?JQ0~S_8_Gg(^zm=iJJ#zzg6n;El?!>n$5+
zFHb=ne@g{KgnfDkJR)jm3WlbAY=i92%dd?^#Dcs20(fq)xtZ{Pzc>n+1b!a9#}(ie
z@Hg<<HVL%ov<|S1)JlJWL~t3$cisYzfaksRAC?7_T~ViXQ|tquD8KKGf9%IA`S$-n
z{1K1ze*^plytGMP$NwYJj9<(rT$}Wvd|;0om7W>wbwv7)^87{VM>Yq@^v7G^De&BZ
zqy?bY9*XwWzByySU80sfQtpLxc@}5}N+P$Z<Q2Ao4}`pbuWRuv2VY@3kXE!UsB#SO
zjJWHcjR7EaP-}Py#dm4Y@psgJq_C^0^cPkn=1hfkT13PANbB=nN9kLv1`Wv;-Gs4I
zi`VGR{~Y)ocy0{f&R-ObqlGIYWDQsc-V^k@bN*N~o}W(xLfqGs()%ay2k=JM3E2@U
zY_AAK3<7Eo=exWD{!;okd+Ndg1t62~tq!aLFM&URN4k&7{9R)~j~r=PNV~sTYQMM2
zKM_kueNqR7IIby^H%q{C;6K0<1I}zR;a8^LP?Y8qOPadCX%=|g1siMinWC_*MJ=XA
zBd!#DO#azc`g`mUsgT<(b?(vi`3fv>CEW{Kr)2r{iu5B}^}*`ZZusvN@H_BG_W`DK
zysesPgLtMxTg3fx^7|+7URN=+q-(uTBsUOpH0bNT0-gY`&4Fo>czx_8^Mt9*gWcmk
z!bia03Ot!HVPYLo=0=$Oq|6-8i}v18{_&{B`vj;<t&ek`mw*?*pTINae`_wSN>f-N
zZ+)7=|Bn?kUK@MVdV!o5^A>l3dG6DDr)yoEiKyhaz$XluYxG3^|E&Gjj$~P~t&M%Y
ze7Oo8krA00skJDPrK_uZpL2d*5#R;E7vMcXfEOfh2>!qi@HzNEXLomLky>k=89H1$
zFJ=sju{>nvrlZiP%Fg9Jmra{yHcgDNs^~M>-?u~|I{pe4GDjAmZKTD&!q0Gb2vm06
zY<;y@$3u?u4Lp?fya6x0T+&fOx@-?)gFG#9nf@i-?8g@6ZFc39496fcnQJ1gPFwf$
zR?_(OVg<QfygaQ74)`AY0Y7aeUGK__lnn=?Wlaq$%G7MX{-(`o`JFX#MsLMA{Hv^A
zVf~vGMLB?hT^l2IO|e_@J}-vWANh;2<!}17{x{<N{%I?3_Nsx-A-~(UX^eGv2{+(J
z_;E;k5#m<VTH2Na?J39dOq{#P`n_=d>FOAI{=`#AO<{lhsROuaKu|?AT0pASQM00q
z?=xASVe1jk`^2ReV9*f?{l7Lu+uXG~VrHF`kxp0_n&a;ZU!M-?f6w~;qks@sk!5%W
zf5HERe{Yrly5t|{KWuAHrt>M>g5Pz0em%7QrX?Ty7I^lBz402Jz%}?q*Z<8-Z&RC5
zmD7))X-%%@sYvgCg4;v#4^{Vsk!L<q+VVd9q40ZH|69fP@0dmDD)SWj^9MZg^@sdQ
z7XHV%T9fy>2LBXk{G$K<y2WrMv%hVf>$bq3e$(}Nd;8<9UL65xFDOU(Lebkbc$T&~
zj|JS3y1y2p#<;5GO_5e#gFoTUws5#FwDsw%9n(zzKzaL=cfF=xAIkHyENqx$t4?av
zDT@DZM7sP3JRE`)?`T7!z;;aNM<tOF+<}{L6CP}ZgC6KUl}_m?0T)&jzAP#AdlPPL
zC5;E(!%Z23b4L1|B)p{X?Us0(3+r#eece=E8=7j0rWTdtzX!K>g7bCxFTg>Y3+c%K
z+=8ocb6e8*MYygM<4ZUT2h*IDdbXa6H~ZFBIO~ognpDkyCp@>zun(TXHTZQ~C|2OP
zMEzdCiSxoUUlEx89g${lqd)K9ny#6r#iuZ7w6o7ddY;z5zIFXQRfjLYNe#B95^>6#
zr{BN7Eu8N&Mam2C3=Uf~-&OdYt}oZNmH#`quJ=EuLG4jVaaGpyFSxyx5}*r@b>&(#
zG~9HRPK$aZ@43DF)B0QR7S3z1bVSzSjW+)V+}?Km3vf+8bwQAlW1=K@BRu>Y(*LeM
z+!QcZN>ZE<s1&jvscv*fyzSfQ|0}(Jven0wPRq2YWCE@WPI<6({ntf(lHPw-&H7#C
z{nO#SwG|G%0}th>L;uGVL#!%*zAE_-ufGqEmE(9VK9MOS6pHlApKt@7Ze9O-xDLQF
zJcpwy2rs8%yHxO|?C(bOP-rlyz+q$4m{KxtT~(_4dlE{_86nAx2K&}j_<E-D&0weJ
zN8zxLiw`QR9}(=s-3c+unu$KC$?D|5(vf_mTK6hDf2QG(kR~QA8d(q_NrLd$as5+z
z|3e~ut|)o(T(z#(L)sso8XOWK&jG7yyn~1ENVUq{);|l!gxou>(X6Hn*kgFM`})V>
z2%Hw7$f#Z4o@xEBc3OX?{XYV8nvO}gktGp&JlcgcJu}Akg90BeDSQn1?Y902$^W2I
zx^;QAXK-JHCY@~`4*fYLWQdZ^^ILeO=C>W?UlJivoS#v3-YhBtPAb^n=_@8gsB%QJ
z@l~~*r1d}Aosef<FbM~Q>08zf_zB$GlTc+=@;j;+qGx;>5BDTg85K;zyk^B4N`5?3
z?fAtm`hQsZKWn+HrfvX*-hpjDGt&Q~0(&W0;rFrZ&yMmR6|Bd+z$zM6_<amdb|F;C
z1=Dg$=Pxxit%&jcfojFO=}(g1F~dY|+4`UCP6#tCHlBHP)OEyXuqX_{g8rAbe#fX$
zx>eQr5UpA}iq5xj+>%k~3bwRQN<T#DH*S;yO||<k@A>eEQ4%HruBy*-c^A?7Hcr`K
zDY}lQ*3s^s&zB{?aixR0BD7iD52f6gQF1mE0l!h7>E55Zgx@gm<oNy0p3i40h)-Dr
zxTNRT_k6w~Ea`@IV#Zx`@87Q*rC-{=rXaBQr9Wk()Et)sUKJ(e-Xd&>Qg2+=V_8kR
zt9w3->Hmm<r>}^TaBul#f{K}x{<l7e(sI<^zhV3L-r;w5U;l(yE$cGC@%wx4UrG8O
zCZMg&U)~R;=$KW~{O~XC|NKr!ehpo#m-kPJicH|7us_=-zkQwGDH*S5iCvQX_O>52
zH581i7|}9H%)Ng<m+=|3!GWe!a9ICJyMI%PfV=xy_tsC@p5I&Vwtk`r6;0e9_Q|b(
z;bVS3%4fa(_`Pk{?}Hp4{HfUbKj!(&&?r9U{iBVv_i?1^A;11d|Nehz|4Igq@WJ|1
zQW@zZH;;YEzqH%3{;TqjG4hXmg?RoW<)0au!M^P7kYu6^^^xcAzxw_qYn|O!+&c7d
z`M;>Y6|<jv``w+EP}=?cl{jh88+(T*>60tMsLR8yX&`uSH7y(TMy@*Vs<klhOEjbE
zw=65d+*f!N>mRioZBGmvEBn#3l8`e-wEi{yv#<4+l-;lCfVVa4v3Gbh^!{!j)`Xt2
z_wSEdf_Pw)K>Ix=HEWVD7e_lkc>glhnb%^nYxhrP&^J+ITGa@|2dqDCC>^Qre%GH{
zyGf=p$$r-#`V;w`RsFNy^*1EHnwm6{{P(N>6%`mF#jyTC?_V=Co}S)(ZU1Jvt}Tpi
zg*BOweSI=Jf|;G4ecjMX5?<B1#F`%@|GKVsDMF7?+z(j)n6>PN{_Si1=<wDhzozQU
z`#ryP(e+ix`QI6DaY_*F_8J)+P?~eDSZ7#-S3aM!?PpdAwgXB~c9r6IA(DmN%xF!>
zeFs%!s;a&HnG)=~9fM5I&kO%ASBu6AcoG=>JAEdEJa|Ysx|ZlAUI`<9w_`9SDBW2d
zfM}3?tpw?AS&B?)=6Q_`l;l9)==t5&KcxiwtRN&&**E2AU+=C)b)g9#(iJ;bkvwW7
zyP1(p(AkHTBQL9A{;lR)7It}lTF}39LU4=9ZS4PU`qL1S<{@JZjcMj*d)A0b(w`$z
z!Q4ip(>&X5>z~q(&nd^=)+qZ+$$z)?kHSGXDAKKz;%SPV;khO$cl_jAj^IFzPH7tU
z)$Uo=QOR!}=Af?Gnztfoc)Pp)j7WbD!>j_x6*1(ziUHqU^k+us<%a~I+?4z`0Sb2d
zOzOvH3_h_f<Mn)(0#WGCtc*uAnFKVz%YCkYPI#tcO7Sg=VBpoB_UDiatm8^?t*S<{
zxBVO`p1%q?|9g3$7b+3o-^qqoME!JGyyFd>o+UNV+<-enJ)Y@Pf}?O*r15jc`k1`W
zBe(;%;qgvBXdI5ic{nTh@-d|;-@zlbsNCP_4A$WooP+Z!bx#|`&Qrbr_FWDc9EQ_y
zS`Vip)4B%RpQ=&k$xdf*2F}A5a6$C4nYAvYE|t_}^?uvuYj8~RKWjy<j<u<#E`{gY
zK0gl^;gYcX4=ByOYK-&O;o(l}pM;Ze4$fJZMJn^Gz+*jsf2TTOUHWrDw96xMAWb!z
zT-&KjsigIPt}c^N71>hX$AdlBKO-RhGjP%vTBFP47F>h7JDtHg9ECG*UexK+ascnu
zC3Z_)V#E5M!4V^EpH~bJSOO2V{zp66plLV<m*7mGp#$3gr|MF<zmxvd;RIZUOK?&F
zOjk9sw1hi4zuU+^>HkSM38zKdye<cjROmK5+{p$`!D+alBXmg6OA)<1uo=~DJ`Feu
zr{OG|*7>cd$>9myg`041C(`$MI4|C?gLZ>%t4BL^$vxO*{iom}Tr~0{<ba;3QR?na
z@^8rgr1w84MwPC0s;4fe=i7#nhvA%z@2utc*VSTp8}7j4o%DYi&WQB>1WdqyoQoPX
zSkbAj!>z6FP=k~31$+yi>aUh{c}(5%PgVWC8e?iM{V(9}@Rev|QRjUs@|~4c$l$@&
ztlK1<h0jHNIA;@ANr#T9_pi2Xr&U9=w_nP7&IklOwMw5=_ei=|J{{8jWAGV#3FlzG
z82+uOQRN{Vgx`mz#L{O5J`r!&Sy7M0ylZN6omcD3jjijii+B19xU4p@s$C)9>r^JM
zFx;s+51+x`;Y-PRR4JHLq&lzm)5_44PiX&n_zJGTX^q@wHt2KG`X+8{J;QYo|9vF~
zShYc;bO}8vua<2k?VcBH@7JO&jvn3AXP&%zQp!79V~A<EEbnnzuUa*LzvIS6d26c=
zXc+JMIo$xqb%RV1+-bGHvUUAO#X0+l^mJ6epTLC9$|{w7%FpG0S)|{mr01yfz720h
zQd0LZWtabH_*zSd9_yZ9M^@pv2w4YCIGgnUID7$L!g)BNSBMIAYSn+Bz~<Ig{hxx1
z@U?jV4hpxarz5lgk0gTYLvseP{x9K*RP}%&=#JVXQ}@s~l!LF+<^MT+2Uo;hTC+*>
zG^2e|dUb1C{lB2~UlMn2j2f+|cYO+SxVcsS4LB>}-LpnM6Vq1fBGjCKKhzdFy#8<C
zGdLy4f|yucgR>&Zs%?9Ilm34v>bThr()*lo%_1L?R;GK+HNOXBQs893Pl@?-pj&L3
z7@1XAm)eh>`>$Vs@8JhHCn}zf%3~=dJZDET^<{4gU=l8fHuRkGAEQQjU(*n9B!+MJ
z87{+d_!j;HK9QBI8t~R=vF$n)CN_0&4nBc@h&66X3Glv}Zu)RkSF<W~;FjNoC*VAM
z2S324FsInHZ|GE`rjq@w@;@fxxliRNsw%Qp)jBgH$C^aDd4^}<5_~V>!r3D6LAA&!
z&=F(z=3R0WPK!A7qShZXjwP!K)MSrZeizT+ID942_O$+zjXd|&wK;0Q`7d1lm*H>l
zHxYl<0?$)e4C7*SNk4wq^``-+;7j-pPU`y8(3DaxPvp2exBUzy`F{id9X?Z(;5pzl
z063!nCQXHH%KxDB=Q}uNfB{uCE%giqqN~30g7r9Q2H=oXrE4R<(^87`Qa6U=e+<5q
z3Lm$5>lpd(O0FH%6P8{6pTd8@*Kk}dc1izh>RXxB@KD<STkgZg;4FL%KfooK?z$X!
z>YF-Xyiz^oWro-PAMlyj;Of?7F`^;zadqyu{QM^U|C@+EXNts$Y0oFscav(qxBdK%
zn)kdc6ObEP#gs}y5tUr`TyC;I$$ow&)VhlFB)SFWbgk>FcXs1*SgesLdY-ZNhBQ4p
zqt?Zg)NR2XKfhsr{x0HARS)tF?WrgrEX$!RdRcZBF2XnP6`Yp!&kbiT{k@vrWKC#w
ztLaaV!>90>eD6T$x5$Q6go2O;_TP!Kbwheb;0k=B%g?m+SoYMYP|+Q^uhFHISpS@8
zL$APbn^rFwe0)uJyq2mO8)dbcl2^(5Oxk;PwDL^yt7w?zU2I?(j%xiE<RZ#8=z{Qo
z=5+dp<$nM!NdCt}oz*v}$keeg16V;RRfeeR?si44JC076o8*QM!bNqyFU9&xa0o8J
zmvB*}_a)iwwtx`oa8U1(x(08>MSKR%!I$tEoG|9#FyKg#Wt-<)PWqFq-<R-(XnO-{
zWT4~PR~PnYhVHp(fal<X_V0i-ILFjw*qMR4CYLpvw{gBt!e<g;Lw`Fn;J}*X+E&-z
zMq5JiKHn&Gs!L`SfuB^goD(W&F6k6(uK%oM&PoFK&+G_}!C`}(TY}s1YE#C$enO72
zR7A_{Df)_j>kS*2&hO`N1<soO_6%w@z2;#kE2*^*m!J&h;j*5ewWE?5Tu-z$4AeEe
z8a8+qF2Gmtso5U~`pK2vP8rDscpY7<Df9I?d<sVucxPL%?$i3CZGRyG*pz(y6uyB^
z<ozRhDk)+~=eMNwFT(B6pJYG3f=}S6x=Z8aRuqqoXloZ`KiAxQehFt3C`9{3jLgTT
zuB+?)rQ6Rh;Cn?6b)DP+W&R)1-kOatw}d#l4&-nez7=o$R1xRF+l&GOrX8fi7-@3y
z{bhk9Wbz?RL#r%XP4s>GjY);CS9C+F8Pb~%{|EL|Q)Q`UvMgued-ztVyQ;7{q6?sF
zcd<-Wf!8SrO5y)ka1jm|NldiR2I5IsD|#=)-99ia_k9KCjrFc<O9*3zg7B1BT~B}S
zq~w1VrtR`qvdR72h^fj-tiA^h!B_ADd?MQ0auF-GZD{T_n>=`zRy3~gDJ37qjK(P3
zNN$&~T;bQ7l*lW~0Ddq1sT83?ef?GO;=YT5*c@DezbSK3HGslF5$r0Wl0_9H`@kq%
zfbZlzN4EgQbj$(OmDE{)cWM2Z?%!WX{%m15>FXX;GsNSKg6cec3;zv1g-JuF>udjV
zo#C?0uD^+iw+XlaUnq7QQGlFj^UDUCS(OiZ4X<PUCv<&_^II~U&$#>KdWD`G&`Ppi
zpTiHj`_wH)?-)+)KxVWo<b$`NKPh?giPm4S0IXyH+~GqzI=^diej|B0CH?Q4BB0l)
zs;c!C-lYH{^#7tR4$+<3HLRts1#?R(==R<Ie5K5fH;6|Ts4G0Mlow6~*;cIoJNOz-
z7*<c;@H=V&%Am;peb~>WKPh)IV|F^TQyiWoMU~4sfBmFCm*Ma54V<v~x*qh%3V~N&
zcgf>$Lip&6z(F`+RkJvEJ&RxybVnJcG)j8_vRv-}qBvu_hBy;#cu`5Hsm*0h$yCNw
z`JPkfNu?<FuC%8j#^ocL#U~knBXCBUhltt+MZ}bjt|jA|#+-$631&2%I93GxsTM6+
zS!|z&;J5;XjqvNJs%~+~@7vMK%$ej`|Ir{lAJcU15#g1Wir9NqL;V_o!*D^ddM!=2
zo`o~w4euG~afFYVPVWIY1s7y0NeXlf&cIQ-K7_d(v97v`k=7q~((`%c$0sfM6h^CJ
z;H_07`JPFljs2fcxu9i~1L1uK%E%qiAb-Qd_wzzX7}HI&C9IYe^)Oap8cxCm)e|=K
z|F9men6|DP;c;ENkVJS=SSDGL|2fIOroUa&yL1fvHO=rGSCw}()_>dzmnA#v6*C5C
z2Jr}-lCo^*f2icR;Y{}Q`yE-bnuPVK>NHsiX5ozLJ`D@nRt={Ty+vv>I<AXi;r!03
zbE{@~zMd>kU-~<%QRK;3|2&+LNr{Vd$=rA-QboU)pa3@7=2Gv*thFv<ek<HiF2?th
zF8}mL#{^F?*tDLTLke+^t->svRFQ6T{ZSwC*^jsZMJ}|W^L82zLOIFvgpCGA^wPKU
zi`Jhu`-F_$#`&95(Was26|3i)ndFYXwIW1>RORfBs_$yj*zsaGDFU*tPIX<aVv|s-
zu|6%v@DT&2?d#Rc%4g;}$p-`mGY%`6fPSZ~YFRR&^>qd@P{v99{SpjDVOk~Mn%p17
zeJdN9R0F0|&)6VsPknc_Ej%lk#4*sf)J1Zh-gl`ClffV@>tC|14^7B*1yA|l13}t8
zCWW3+iDF=T7_tZ*$Td+u1N8>21aB~><YIWAOleHd<>gv$YEpYwQtRj;%g_6dXNn$5
z#^o_3Ab=;(PwPLd0~GI!y-Ue%b76_bp;lpi(vqVXF-|OGL{WQ5E!Hy<)^Y~Za1@T}
z^4z!BZJ_a$iWIq`R=b0`RQE&v(*o9r$)vVnZpK@TDuzk&Uz=6aW!;J{9m9@`wRBYy
zIH*j?a<b`%t#vrFNZ*Y~+zCfuTKco$zz)Dcm@xVGb>Jg($Yg-V<?s$bqt5D_4EnfS
zXGewWlKg4Ss_qrTMt)y{QIU|3S+MIPUL8q3&f<vU05nt|nvm=0O4Tt0wM;SCsOk~v
z{I(|4@-bpabtNseVx(i4aw#(c<66K-+fnAtGe4O!_7T|DCGQRWZ&-Z`-4r4PS1KkV
z%T^MZgnFITX?<B;4hT!$mP`vu8u5ttT__1xp<x#%GdTI?z;(<1zp7owx^<gvkFy^2
z=ZGV7t7cRcoaVV9{GnHm-AAV)tYZ4VHoKfFM*3W~y~d_HbMvERY2}D+Hr=WM)ROtj
zLD87U7v-818BeL=3Tuun5r405zdw*w%A^u8W;RS(-7OwEisUh6{a<y<llG#(IieJ2
z$6Q}l+}H?ZQgB|WO9ASp{R6R&g*Z!=g6dn8x^bXlFn?gX7(2T4Q`Y^U0<D@z!yCnT
z$7?|G`)%W~tU{iE31*bJtJ(|4%a@A|q+w%j8#jtgZ))*wth{1G#64+|CS_=isK8d!
z+RD;sk9%SZa@9m9p;~1v<d58KX3u1%kpoM_B$eGL&P~CzurW$S1LDRY_Ae&w@}&Qx
zioLP8sb||cuzxbeKod}{vN~?CMtw__#wSj96!c5M#hgojqVASi42abby5<IZre^)B
zF`$R=C|aSo0VM-9A$yfqXpI;|bWgCIwIboS7bD8X8NG3-8&&Kbr#-YNw_;pmbnC`!
z(2C1)*`TZa))MBdzr_TXD^?sS7f)N?EN@AV_3iN2!xBepOV=nnO6~x9`Wum$+ANNh
zO;A0Roy(>JWvO?jB1R`jVN1F&HjP~n{f%6*iW^l{<a64@SB)6o1rUcOtZM98JxKgA
zGkrbdHuVa2^s=NVy%t_2s(UqaqH%@95{RKdZpk+tbYuxDMY02Lgo(s*+-^5bwEt=R
ze*?3m*uicYiRPxEF5BO5W$0O0+|kv(KlJ0};?e8t{@=G;OsVJ_h%F359c$^?{>3YF
zWPKz0C}ma|$c^P`sD&!CtEm6wLY~dTpAcb9XRW3SLDQ^y3A>#}>b<6THOzQNGOKBK
zH@wwQZxLp?-E>7X=uu4``d>A&0<?eO@p~#PbhVa&PEiis+$uwu^^Uy1x5F^xSVK--
zYuoFoWSJ=@k0%F3F>eU3u4}#r5SeznqD6%k_U!F8JG6$hWnf0LZ&Ivk7s`-%ioI+L
zt7t`ii{46>8r@ibSutFUp7*7vRYTiF8QGGdx@SY{cLUxlR&Ux?$@1Gs$?t2uV(V(p
zcv#9s81&79Sh8R4%Xnv-rZV6BEsTgFDudlH3msFm+*pp;Qd8@yhinZat!}DaE(G1Q
zs$piiwa6%SFk96(Ll+*{tzsmD!?10rR3we#hzzW1T+tqed1<Q`qX}JKl}Bw9?XC;9
zY$Dr4Q%)@nF~MK8Qx%en?@Afg4IkPol6Ge@k=Xl}Y!`cK&R&CdU_^BEdj2~d8NIHN
zMy;xasy+1ML)WJ`DE*>I2`xzfY-&;{cO-x}Pq|%aBD>LpLElp3rJ~0gx<RA}p=BA`
z9?Dj%=zT(S<KfIyILvV0<h`-}p53mhvIG>d=eznlE&bDv-(S_~t=NI-7HcrEc@Z#e
zGGYw0?E`aBaa2MOZG+|Lh6v(zfSG}C38kXz?wdX8_4IwKux_q9V*JXcG1)u_{m}o-
zoiO$6SX2@1EIsp$J)^_fRJ917BW>6#TK~+1vhh^ESTqWgp4&RZ4V6r7+d&r=!9Lb6
zl3O*lZ(-;_u~9R=qHo(1VNv+qxW27Pz#-8-^t(pj-O>3e*<Si>GQK?y=;r!E!!T=I
zBHvtIJQ{6~8*NzYhQwG_rPOQrnZ5eJ@cUL0Y5S59ruyA(=(ZO(x{jb-T4+#lQ8+8!
z{nS*&VJI0IC8~8_fAo~~?a)-Dx6w|4fhR5{1De*qCIuSE>-ygIO-Y-uD1zUOw$`_5
zstyPptdbIbBbUKiP#atap=rN`6*Z7$mbZ7S96A`CV6C=ZW<_Oy%<QR$Poa-%YJ*!x
zP2ulhxyacP?_AP1h5>9_WwncWqn25MvN?b-Ls5-i5y5l2r6D2Tzi`QA3qN}@fNPE%
z+7TB+dY`&nZ={{v8AUwMRvT>_?ME7Hid0dgkV2Q^&s*wZTYy&67|_S8{XVlNGYiqW
zlJ}<mtO^OTtujn*6EJN>h7pY>SdG^K{@M}}X-lmx-UA)C?vg~<QVY;x(w}AZv`#5l
z?-gl-qn;E_rW3;J7WJpA|GQUox&u>$o}J$w1_ryPH2R=uC0+A72F<me^yj@WCPoZ>
zpl?BR%S>{w$f32-)BH|7y;W^kWMm?M4(qTYld++)9pM~xwQZ&17HlS4WE9(ag-+_J
zeJ^`hm%GA@N1TfgVKbnWx_TLuZHk}DU_id%Q;|GY1&Z=QB;gx9Rc~Nfb<kAr8MoU-
z*DkMZo!OP-!IspJcu<+RvdI8jQb`B)RglgP^xyB4b12IIhdrrUQXn<3y%Op301;}v
zmFp_mj}H}L1_rC}Ed69tSl(|`g!9-bB$??vrm^hD4&0v%(A%JZ(<&l-8JGj^X{vA&
zlw}zn!+rH&jus;T%m+tmCN24GTK`*toXiy)EhDC{ORfX`{%bMJgSZS{z;SC7jXamf
zp#$|`z8E6iegiLI!2tZqa=qck!XGWe6L=V;+wTQ3bkOEsBCM&&8TO5N<*Bbf>0i>P
z=`GcZ2oby5*RUCjp+B!hLOxUMfkyTqr8zoAJox}#!Fp3z{Lj_G<#w}TH-OCcKk3VA
z`uYWU3ePk^U)7UcbHHU~o|;0&coBnCYvzNdjOnwdBUG__Oxx%e-a@k_n1ffEbEEkF
zedWc<c7D=%T?_qvEuNzhGt`j__2{iD0=Rc^epcWmJb}}ea4HoA8KHDj(DfS*umx{L
zgfVHCU~dF$>VBN=H*erY^uUMyyoDn+y<V~GeFPw3f8HrzBkX5757q7H60tjdPp~M@
zll@7D`6axUyN}b}QL>|L)<5ju>yY2FqS;ol<)Q>#@=dSZ6RyV-xDQVPQD;>+AF%-y
zGXS|6u)Zj*9>ddk{yjW_M=H#R2QJ%1tf#8LB0PqB@H}2%L4;VR)d`p>l<FxaD;f9O
zqaog}RpI2E5z;}eScJtykgU}sco?MR??i?(qh5%xA(5^vDf9Ib?g~z5a|2$&Q+Tb$
zj2I+~!%(vPOUQ5Y06&M@ngE}Z4UgK!zzpcRxa;o4Al*B-uQC0G)*ttkvU%c`_=Fxg
zs@S3&&~bsq*NasF&jqdN_aDdpufYqr3nv7|R?#m<92OVsHF5XdkM+NXJL-%aQ4G>6
z`qQ@V+>7u)a4MVfe+G}#3D~fU>%eN_>k0*54y}Jxe{{MyfEtPYp8mZj=H1(HC)WQK
z?g@OQETX8~62*R`lec~B253PzFUJIe=~e%VW#aor{_`RNh&7QZowqhhEH4x(xRz1Z
z-4FeL1NY#xkTP;Bdd0tIG6d<c-gWseiOl4LI06Qd-Yk~yduHu79eC3JJ8(=LGCqpq
zN#%~Brw8yb2I=0weK=_VhLQFSvlD5yMYt<Cr&Zto7a9VI(-K`15&Cu&KtF<K5dba2
zBe<avtOHvbCraAFfvy`5b$$nkjOHmE-%_%!6rEvT(c)cSe+o$M!o0fVqaZz?%>P?z
zye?5)gWnv^WuRcIZa~6~VqJWo)B7B*!oT46fW(Tz*@Sv2;@TC!4J{2H-GkrYXCK8+
zcdLdmUYD#a76I+5@T`7^f5BCUqlrm@TC^uwSuoN8YvRrR3I2fRY5lR(QT-EHgt+*m
zoBu13&j0SLl^vDL##Ben?atp;YNZED@DTojU*V=tg7#F38a30On-AzIDfR}g!hgaq
zK^)i8=1pjTdT8Y+4t@uB;1~Ga;h56-9x;-dirKBq5=rY~p8W}~h5?Dpa7+!<tt{Ks
z0c^|r-GX1?uaJLJPIFo_pe4(nm6di|maF>>euMkrA(DTsi)X_-D#GtHm3Dlk{lDr^
z0$nQw)~&Ctqwq1x=da)f`~rW3{6qhvmnE~hg$p9&pVq&rKge~DTK`d8f8!LTyVGAH
z9(?cmQ?<q>tc4^>5`BeANrnFi+S{hdZ$f#>cJUJ5Q|$i$euf_%jw#k3y<ioKqcbUA
zTc+x&Xlo1SH`iBGy~o1ZaC>rfci>m}BjCu=QX6VU9XAyDww=E|tcW!JSGecv&&4w}
zX*b}$T_1b0AJ0U5dMze#2U3BCQ1q)sD-dB+$p0t!E#QD!_Hf+Y<M}8_q3QLXL_54`
zKjWHO)j!grGXHl~{ocX@ktSaq0&`o}B{`EN^CD*7c%!evKgAp03IViKa&-uc$fR{N
z`gmKU&)0^)s9ILX@bXbuf;9zNzr(-b_aQK)mh~6b6v1K+pr_sYOY7e>Abs_Imo0@+
zvj3|o?{g1+73uuDq4h_YP`1U#I9*?Wnt#GiBK_W6zhCllGre(hU(xjOb@(Uz%jF;I
zFWJvV_|=#E-oRb>1%8AZVZGC*X`@O$M%lBr@CA4z@B8~!((<-M=#z|n1EXIR?ej12
ziyOf7eDt?DiqSx~nPmMRi?saK5Fl*J3U@U#x_z_$>F#?KeuUrPiLbw91&|0oqu3Xj
zvu7gRPU~-m{3BagDc&HWL1<n2^Ar3Nu5Z<!o{c(rDK?simf$J;3IBHM7yH+-PW8~A
zNPD>T{kM4U*W>&)P5u>0E(~}_(c@kC8U8usJwyN5GQU=~>)SHCfWP2h@awknFDY#_
zRQDSw_J094;3wIi!unB5WW9L!I*NXmb^d>W>qB6M>&32-xchbN?zt-Y|0w&@_4T){
zH>+$l6m$d^rGNi|pWOisbbce;D;bb{c=!d=pTCCe=epfgE9Q5*l=(lyYhjrt-*pup
z+XZ{$lZqhg*1j9vB5Pv2PS)!exHE)m-&P|~Q^IIVB_r){Q`Ym^Ryg3EibCt!w4N2K
z-iWdM5BL%OfCpPq$yY?#(6`EaOG|q%-~XQqUxwGeW`+G2plm6@^GMz&Y0vQbyDEyW
zTf7hrQYlILK-TMzZ4rT2?f3g8yi_8826q&DC-+}ie@nnR>sDaz>DKlZo+$ACH(cEc
zCE63;MpGRk>mt!v5*G4p_!a&Izrn+;Xq+p0K3Y_}`il3$GrtbMz)x^{2$8(2hda75
zL|;cs?|;uu@2jC7?^<1NU5eP03cnVX^Hs&Z58eC3`q#9CjuL*WD%LHEbp7Ab|K)A<
zzis;0(*C_uj^G#gH~i@^U?KlCYh-KN#iJ$Qg7p4BZuOpVepl4W5Le!47)$#1r>y6#
zq4h^|a7Phnc>kty6xZM<_!s=Olm4vf2#4KlN_ekDdjI2)_Z-syjyb)K(5@Dgqx%#7
zr2t@QX#HzuJlayBHPhc@KjQAW<N0ly{<T%Hcm<E-y?)zPe?s{~3%YiSUJ1_pC#65O
zk$;$=l3HQb^n6M?{s{l+chC6ry{Inh--K04{C|y&<PqG5=euws926e<ado7mXnIjL
z@E$zb2~T?rj%ZB&kZzQ9Xvu)2)cu>Ce!QVE?X!mb+g9p7<(X6VYmfC`6eRwnrhDGQ
zL%0by;n7Zb+7oab&cZ420gNc=o2sNqelLckAD@b5+Rwu&Q6lt}`FH|%;odImF9~Mh
zjAnbsC4x7qvpm>E{-badPH3|Hkbn~xRHR6yf|ol9uL4K({_{cx7^rmgO#A<0cln=!
z3vgD*!#%ZKy@UsF7as4#=`jW;^!o=55qU*<^LucAtMzyNISMDWe+M;zvSja{_OG|o
zSEMofGezX;uCSEvz-@TClm3jtF@f37+gw+QeIHByFLwIzhA=r!XtsU|GL!#(xU-A?
z0&|l8Y0U=bR{Fau`R}OzCk%FYLiMCYcq-vP*$U(wpPG@T9}#wCP07dP!*0Xloj9FJ
zFmI@M)5eIltmo7EceFq0{GJt{XhlVn7iv6s1g~~Fzcpd9oYJ8EsJc|%!VBr|j`~v)
zrp{?0yNpVI7L8Bh;ZB@TV=xb=WIrZs%IdvhxI5dDvBdh1!bKsyOhJVb|3?Lp@6`IW
z$0tpe&j?wes{F+p0WkG<c{mf}_N1jWTPl;k-483>gpd}-Egu=Lu(0RzX;OSvvz6fh
zUORx*PM@-1C1y1&-WH?Nf?B`!_)Kd3Q!-%dVibD2JDpmre+CZNY<5?j4{u;`clnnz
zSUW3VsjlEN-$(1`p7KAS_16`gE=hlu_k4atq*_yQAQ1<@4FI~`<v%TCfK<;|)fT+l
zr~XVE`0zkM<Xd$s?(s><<T>qsrWEFT2cX+!{gV>@n7}z!CA{~0%D)7YTKu#?wWDq3
z-Ja(!<Ugn9%YwdpBN6WQd@kX~0IWb$zxl<W$KLOUJa^m{-;fF}?tlFgLOPg`1Bh1P
zbPD&dKQns&Nnvs<OY`=YKem69Fe2l(pyxk8e`eIt*R{@}ce^u^rS%_D^paUB=B;-g
z?er<><!3jLK}t-kl-1=!KA8>dL@DG0KDi}xd(pDL|4+qc3wxGi{ea((YF<~@!NLBY
zuUP|PWP(3<{W0GBG4ijO{M#QY|BA8cv_68gyP_rluY-NSC%0ax-bbuIW<3WV{rh$O
zep^Y_5BOwO4UeSEhsm#I<=>CdpICpCqCenMmj0I|zYkk~%}R-FOAUX3{HoRk_`&<1
zs~Cq3%YDG7Zu8t9v;K-T!F;6t)UBS-`5^h1gmhB2jOd5`zKmaKr@v6M*0T0TC}`aX
zOFqWH4LOJp6A)4`SC9`|fBgQy3O*n9`@X>+d_ad3^n(?wTy!uWtRO=UrlUgs2gt8(
z4YD6&;A{UC2bJ3a{TTgeNPZtF|6DDrD7yGa2cLbM{)B=4NC#b3uN->+4>s^V<Nv`9
zG8}YxhY$OG*T0@p_8;&mS<n8*%Rd5?50>AMga6<FxojP0-CgvrwDq|OrFf>rxW26X
z#``_57{HVmn<sRDR#oTQ?-glQ_>T=WSgvZgV_`p6>`B32r4gs-5q!U=8fHT7n=#6|
zmTrk}_LHI+u@q5S|GJer7x!HMj8asSf(Km@JHgxiq?jg({cfmE_)Y_*d&_@DV4e*f
zfK^r7-|nu7Wu=&=0**^94lnm)>`tF?F{UT|Yg=+@Wk32iD)8C^#rJ<B72IR}Gy45W
z>3>YJz1fc$ixktG#y%oHw=ex4wJggRR5aK9PCPxk=}$!uPpMaLpcLKP0Qlci{!@mB
ziOz4z5AQ3*H>(tFMK|nK%Tnzo|6IU(Q{vt0$^N|e;jrE1cR(qYhN{I28jjjmihEiG
z%JB{2#0}$@&hKuXG`IZdsA_yG$`5`(if=|=F|g6_eWi#~if>kUy1ANk-st=fhJJAK
zGXY286dV&Ge?#rPQRKL@6Jv3D`y+53K7}(Pe~HPIr|=ja!i!yWU`D*lXEb<Q(sW-c
za^2mf$TlY6^m!S;l%q`tI1RTv-K9u10w+Y;d<15M478?c!UymOUhEW2q$2G}!KWV-
z>Gry&BOhrA%e$=qJe(68dgO?s!nH?{`IrcA4=G2P=>R-~hwuQN?qbFcN(5(A<j&=T
zUceo=0S|VP|2Q0n)2dKUSP=XU?!pa?2yL%FWjJg_^21P9dG)#4Tpz*9oz`E1!*Evm
zGcW42HJfjI01tOE1LGp?PE8Io$}umCaXmFDykmz?t|I?Q+k&2o?2q;1I~EBu>CY(@
zsi&Z(_V84sd;-sQGUHRCYd<0R*9;isDcshG<@WNg!C^Q7#|7V?kSAia_$fSuXS>M%
zknP{3f$KEYXOW83+w1=XoYLW&6Lnix+5Kml$J|l>bII?t>g=h=+SY{8?cJL!P?7qy
z0NfAA1ishr--E|Hnc>u=a8cY1<2t`f+W&_t@^3HyIvj;FQr;*sFYEp9YyI8b*MCm*
z4@rNL{yu`o8d2LkzqxDAMfd_PNqc7$0k3I5?*>e6ZL$yTIRT%+H{xBMlJ>n3wnOT3
z{SzK+y9e`d0j`KUa!N(MHAQeo1ap0Fs}4<wwES~8CIIG!x+LDfV;Qh(TN&#q@<yGO
z_Zg^tEbZY#xS{nAyYdW<!KZKqE{gx9tj4hv1V7Z2U13Tm<bO`2>F4eAp(6P?xDM0s
z*Dmt^626Ah+Pp}!uB%h`_nk=VOK=1(N&cr5L6<e+^+t56)vax^h_~u%xT2BI+#ti6
za2gKl&v#t^CvZhw3Ug3X`102J?Eiv$TYtX<$KZ4L7CwhlT58X_#L|@G7+l-hEif+D
zzAr?EGNx|p^zbqD&D`ADCJ0pExQIh9i+-SP=z1^U7F>lJ@OY~Y&EPnErs(OY>Xf<W
z`;zs&1%GXAlLe-A_~OS$txIM_SQ1m)GL8ccI1Qh{CveJ$GpiN@rt|y9PWp4)Xp2uk
zL%-aC44Uv1{!*v+HuAp&e-rQ7gwos{5sao8`zZ}64g1iFm~4{#PAPy%H`upuQh{OJ
zryTt=51)$q@1W*Z#}ve;0rx-P7r41~{gYy?{8GfD4W-~#MKE~>e$$lbu>42h6nqX}
zO8-U-`!DS86x86+*7b+{zk*9TYU5B9N9|L10N3Cu+zI{3L;C08d-z7NU)(<j%ArQ%
zRQi)A{_8oM5bfkwvL0zxtffoUDKUa3sXQ3k!+H2j2K<a>_RAV#%nVsS?P{UbeGJaQ
zHzF-Rpox*H(8(snGjm8*;i><49nQfI@IT-aJ)cZRMSTZH<W|$J+;tyT66yKh#5{UH
z;YUjYL|qXLrPhnT{0xtawE0{3M3@JWL+mQAHLb?G?$G+na16eHAK+^^rlZr>+*=MO
zWmVI)>gJFSI|5hWTlfsl!XaaiN-;pgQtExU?az;D{eOeY*5H|5zOK~TxLk4{9uJ+~
zQ}8u>3l|kTkLw0C3CExZ@8O6<FueZ5@G1NN-)d&RZcrJsx))E2GV!|Gum+rkZ{hDE
zE*v)u=n<u4M~%MX^^p9J!>9124A6{OpENdbL_9AOQ1L;%SpOCH0WQNKr2rdBp&WpN
zV#P~Bx#jOa2It{#@DI_()@(pDZA5fE54P%0MXaNLgNt&#Da=k<I473BS@rGQ8Cw4l
z_!9mbe4_xQEF8DC>H(*9etK}#&+iDFgYV!!;F7vqQ+A@Gq*U6!ahae;{^Oav{||7<
z<QHX}hN==_RPGPipF{8od=Fpf`Vv#?=@LC}lxhRG?lI^XT!eqfejYZ!lqf(Qhhs3I
zszAYh)};U6D1aR)BD4;yaX1ww9}L|C=HW~DAMiCCk)BqS1)El0G~Eod(EmKN_Y3#}
z&cQ*&4w<1X_Ed3hsiw8&ci}^D3BFbMH))f8WfjB*FsEp6Rj?5opDDN`*85|+{KS}H
zTgzz(Oe6&qw~`FP`h5-OrM+dHfvzNRK(&Fkl9`(Z_y~Lf|9~&x5TG}V5%kEdrpC84
z1GH(w4!|c;fMc>46;tD?(&JGTScDr%2g~vv-@|2?H^{t9<9iJ_q?n+slFW+B|1x|F
zUkI^gATZIg&;mwO*&Jwk<Ze7ap#%6WoDh4N52>f=w>q5AJj)u~O-3S<{J()qN?zr%
zI~`e~x~7lXLZ@8t^?xqf<b&3qf|RGO^!%tAAD7^sum3b$G1Bvj<q68hmRuEL$`ag3
zaag+hd<x$fZ+LVdl(d9H@`o*rRBbwt!*B_{7HRpMQ4sZwWN1u=r7O^?jn5>Uhi~8;
zIBsYwWu3y@Xlwc^hTKc<SQcmTm(u@+>_*I6r&D}NC5>VIKM0rM8~75ADRsRr{i#R<
z5l}9`lI#D39O!p&){>98QZQAqyv2oLAxP_!{#=p%A2jU$zQJIRDF7-fC$VXN=HMKB
z0iVKgK_*trfaW4gEUW2o&BxVJ`2H<?1*g;?97XKhu+mRydTj-6rt_QbZr_NuchU$_
za~qi+l@(mk;VtO@r|>m=CN9a$03vhMw2?Dt4XuA1&cIjj4V;sa3kZm+eZ_=bOK(B1
zoUGSp@Ev>t^G2Q2H66lcp_BD}7hpvx{QE(q;p6s;W&LlQzFY=0l{Gp^fT!UH_z$?O
z<V(GXxLOf&%%m!73-BhL!CCkWzK741_bpo$f1r`QOh`Thk%n(5&?sDz^**7h&yoPB
zt2&HTiy+p#XE)aW54b3>GYqb_WK;aP+{+tyoe}{D;0pXV_)^v<S6D+4w56c~lxmWT
zvHrNe#oeoKAo3MMT+Zc#UpYYOQTQ7E4wtq5zR^o`HNlDSWCm(mZl(2q4u6yUCuI6E
zncnCZF3B)22rzPE{TJbH@Ex3y{`i^)nxc=!+hyg2I_Ub99MD<SZnGj<MCf8wNtIMn
zTL)$oJANU+&5|TnlKex3%Qktik;NN>OYogy$5~^ZEg5A-O-ad;W<NIyqDg;J_<l*=
zw`%y@eF-A=udlNHda~Z1!}stR94_9}dPTI^Ob;)>>sWsZzrR)TaLORQFwD-X5i%~T
zrnzwmJ}%z-i+a9X1S`yo;GZ4YkM*!WKfo23*O?ty0Fs*lOC_Z=y814yu>oh4c|IWn
z?AE(4AcU%dxwgv68~Sq+zJtHRC1pNqCWR!3u6l8*TEfQI(zKo5Lk1XHD%L7W=0+B@
z{tfwGfbZpfr<HuG6cMv4hGMfK`FG>|ru55s1^9vXQZ^G_R%=;DlP_x-BC$>>+9_%8
zz-HdNhU7A-LhF$r9iGy3<WWth1@uQ|tFMZ5`xIP&lTI3*224+?bE0miXCNH8N&W9U
zoP%>Pm!6-2lW;<p+>xT|@OwxVCN<4CKSWx6SXjA8`|1Y&Rgn&ksnB=KNvjX*kEV<?
zy`+85?BA;}CDQkq^!x!h183ogtXH{sh6YATk$(S#?$R6bKMfa^;`Lw7ty{k>SgTo0
zlWwfP0dwNLpEB>!HUN>Tcz#or?VMcAM)-AJcgMQHwDoOTBohqN6dZ-)PTD?cwfhN0
zN)ds_0UT%{Nd*o&>G=^j2PZWhni<^Bz&af=^_ZPirgNikc@ZueX?I+C`euMK3F{b~
zha;gs({Khpg;Pqt#IhrZjZ@GNY5kE{{}>!o?J)E|UcP6CIoHcxQl7Vz>H2YA>D7T2
zAAl=u1CE=9V=A$a`1w60v{;PVp{jn|x~yXAu{tVT+zpW&QtwE3kG=^%gx6I%>1>?e
zlJw^cOdCk&K&K%3fzvM5B)@~c{!?&Kcc+1ITx9YA4W&?L;E42p%FpjvWvIrCuxDVP
z?L9MrCtd%i;fSiX<Fej;T{GII2uWCnHOif`2~5Bd{n4cCN8K*I5sXztSba>n<PDLe
zw>qGNVMi~I8gRoRr#7o&+AOkk`s3GYMT;3t_#LwZ6IP?&P~Zbby4*Ehy}s=yYEGSj
zW3o0S)HfD`>9N;6dzsvV$fN^F;Psjq>}Q1ZRM%Tzh`9|z@f+K^{^=rZF1mA$>n<2p
zwX8m~gK+>xm0}(-Zj12x6@%>@$e+z>{k8NT37BE52zDM%_LO#5m%%(Ll6Fd4Y-&ln
zUoB#%^`!$biksRrHV(j1#Xgzcyu*Qy=m1S=|4zazRFeK2mqE*I|HBQ$VM3ndkUM~J
zI4BbOiiHc6?JygW4V`lba1f5Ea9*|W8N=I(wTuc;WDe?etv}s;!}|A&aM|eE-4{uF
z`Y|$*&L5Ks_=cCZgYRp=L7g{3|L1jg@GDa|5G<EBuJs>+M#%q=kz#gKeF@hWKBx*a
zZopID;UN`$%0{*@u#0QW?g0}Dm?pvjOl$u#!-nlE28di{Tlx5Tbt={B>O4DQK<k<D
zZItcqkjt1)MU!gM|Ko5{4lFC4zxb9B1&~u(|76&o1BxDfcBN;RC=@=|6~-hLuIDNi
z*NwZgVtJ;HIe@+#$Pss-6B;cZ-NKGAkmkm$VFxNh`rlBAWJE7tu_2PaQSwDdHlb1K
zJg@0wj}~F#OGdI#GW6MSsL4NV#F!16D8$erGrdD5SC$!;!A454VL-_x`{g)5SSc<3
zujC>tRhr|*I6jcOz$BR!F?35JZLcGz8YQEWL{Q$6zrgSp<8n}ePE;mx<M$a@7GWSP
zzn-L6(<7A{tqEPcDDvW!jV`LMJ*-J#(}usR!nh5l#|;Lvf#@4!3mZln-l>a3yCwyy
z8scnb@*CKNEMW=Mt8^yx`wjIhv?PnFQH4ZOu_iY%3h#15iLRQL%q<h#vUKD?nC8*v
z*sAMTk64~2QonJ3@0tx9Q%$;3r5WpwyAS3Du|;KS>?`O`1Q26pr7#E7H^!{A1$AK|
z<j}1cM0o@){u$UpvLZHfwaVJ4K{}KzZIBg1*^&fN5gf~C+Uij$PuVE$N(M9>>9|NK
zPpZu*hi=34C)VF9#{PXdmR$OiN^-Rttx0vQj}}wk1AB`o$fI2Fc-S8g;Ibk{LC-u<
zS(%Ly?cYe~|D=irQFQ6pS&lNyh!!=fdPKEKb5z|8W%C|=y=}=ndD540bpworC76)%
z^=0aM+71s!I>M%(ko_z}Z$wiYqXxBFQuQVFGIXh`V(KVV^UQKhJ$-*{Ku6y+Fk_T^
zVkura^~+au3^P0NJ)Ng8=}}-Wqc6kk6Z)5ufe&`ImabXiaz)lGW`0WYfM^@~_V#g?
z+5jTR_54WDM2Fufn-0fe9GD<Fxj`I<qiD;LhWn3GnYpWrxT0+8mTw6KJgXQ6ycTgu
zpc3TLq43xP=|isf4>50)$l{0(?4S&}j<^bj4^4k9E1Sj*iYG2Eevy>u>b2`JA`r>V
zvh8-+aBFM2f#y=gTpc;-%oXw!wW86>Y~@|!LWv4RRyLI2vXzVC2H7{eH(pf49NW^&
zuCLh^yF2vpl28{@l1>pQx@70CBSFTIu0S4_<&Ncv{2p2=p6HU^WuqRK>6<dym$F4)
zxgE!W<)S>FzsWREwO{t+9i}{?&Kp`+k?^Y)vc_rK+$UM(lHBmdoNU<wl}OlCi}uIQ
zY!nvat2?<}mMfNd?%Ss1`r9#Cu`%kK%bJeM=ESe%##S{b67Y@0oO)E(Yv!UO*%GUb
z;}kF7kpA>4Vp<q6TkbKSr=U?=hSu+wOp1T8Ylp$Fvt@JWn{V7vq*gM*l(2h)Es7EU
zY;;Mrwm5)N(W+n;E_`Y~Y+y}m4jUG?IA21~EM%YJN_r;!%+7Y-o^emIVd466ak*7b
zx>iJs1Ds1MGD|XVzG}GDSZ&*e*j=cbXRB|^?%5rwYd$DULfJ-hI`Kvkb#!!V3pW;-
z11zsFu-ed|ul;MPwLMcMAoi@J`+whZEM0&3p?4YBsji4aDy^bjG<XBI((Ye-TdZ$b
z!k+n~ek3Ww?{`hl`W9={#X~wUVCD^&fOR7mM|;!LYY+6=o>7WbZ~j|f`<LrC!-4iJ
z8qBndZRqxto9Y!i&obKrzg9<WZNr$yF-qT;7dexAT584Ks4sAiPH18DDdqMsw(OoA
z{+>PG59i#oE%X9{pSF&rqc_fPSBlw@(XCp<8XDCv+S7sk>Lxo$q)MZP;HJS(-oTGw
zWJEH#D2i3<uH|1d<#LkB6xtLb#OX?rGR19iEzPvpIP#lW5BH%F0G6$cP`2*K4I7hp
z45Pke|MboGRV>6o(;dq}y4@7iMae{r-MO)y?26=k0anv_Zi-i+V-_2Wz5Om2sgcx%
z*9q^@w}eG#c}FKEe8WJ{8XGC~wn&^;#4<lHV-TW`d}g?Ww$yHu!FpieJ7o)bb7^eF
z=v0ypw7bHn@7i_^4ETLr&8WHQZ8J!pJ3>NDuh6$zN%T*Z&6xC+rde^)_9byRdm6rH
zMl)PrUoW3Jc{^?OqbyphRBjY1p2_YP^=BPz#;S4dME1998L23Sbks(g*1MrU%i7P;
zV#UF)Uu9Fxw)&d7Z6Q&v+tqMjwkrPJU4PR_&)3v5)>9cNeDJ!CP*s+wExlX|8_+8P
z{D%RLbS!3lkuy%6pq-X_YCAe-ab*d=mlb8v6$!<9oZof*a>ZO<+&bJsCGBeJmAfI#
zFw2=4s&4VhGEfW9vXjm)3F%|f`ZmJ`cg-ZXj09pWNZVHpqoN@P=83evxvrADXJqJ2
znWbK>Ee<%YC0*6y())L!*SRMVmKCO#6zi323p*NWdz;$0<1n;~t1TB5)@vwTZP|v^
zXi5dzMf>Yn8_XX>4BvrX))a&Ig5A(D8x(i$B-nNR{a2|6v#-;*ZU~QMbLAe~v^CQ9
z2p$FW@|t?`maTT|a?ey3uS$(pgv+r(dLAG%^&(?W+d}@RNv{eQW-YBhZOH2)Bw)$D
zB`own#hqof^7hg9z1A@-DY06&eTz`OE$h3XKusfkZWe!eU52fr+F?Vs^*y{#>+fmG
z@P+=UttE`uRWmNUeNAw@ht;IvE24vVClQn_@9I~fT!W2oMDpHR7m4<JIjXqMM>0HO
zmw{B^P2d{!VL>GFE3(0ne`(2L`pvzgHqwQpKP&JGp22Hak_dVh0ru_Rdzu4z6Z-Q`
zr045OPmWk0L06mCG_Sm>fMG)!U&AZyf8V0Ut_4N|*{v0kv@a+9S+};pmW4}Qvv_U$
z>b6wo9jvzuq-Nb3e%MlhF3EMhg%?2@-V;vMiVRq0)(4qxZ|Ux-m3?ClVpZ?6AmR7T
zVs^|hmgPNO!jlB8v<`3Kna*P@zHjcnU3~v8Ea`n2h)?R7<{HYye=1h7EGzn1a$KY}
z>GNArCB-Gw`;oEBWm4I9u-+7%z#H`llq`&itU=rc27={yndH|MQpj8LJ0AK*BDJS9
z`=ZYOYOMbiJXKqIsknui{mVc?U5NlH@4`#CqqfY$Hmiv|h__=bi23D3AWAGr04>wF
z(sl<h&A&W_+wj0iw-ZjzbCE9Bi#fomIi{|-@17*=@Q#>Z&Z{va(ktyPY2cP{G44BQ
z^&&ihCxV}-6)jdoV10$#PemHO80&uy58#4Ib8&y`SnQGhuqo2$C+Yn=@CxqQ0B5LM
z#4$r&`?U~RHWc{1a6VFJ*qGhLy(40v*zcu4WtLq2PvDh$gZvW_NO+HzBL8>|{ef_H
zUJ5^_A_wFdF%$%^iuC?P$bUf~F-KM1i(5xnkBA1_cHlSUzXH$UE*!BgC~sx=G-^-m
zOB?I&!y9-258$BgXd$qHg4hs5OWx<L>(3*&4YPppykI%MvQ*IJzp4LE;f_X_q7kbj
z5Z=&;q{#0CinSsC2mod^vJ%CJNL7^NIu->ARgix=Plr|44UduQP8V6sHNF3vIRC36
zq&p$z=Dg?tt9B<}7xVbN#5&p&Ov~fcRZ|Rku?s~cmBL}K4axtAAuxCw`oQ*YO~3a%
zCVA8PZ^BXK^SzU}Qp^RW0ziWHc>}$k*tkxshq_|bbWeUIDJ*HfUdQ_1!((`&&eRdR
zLG_9bAoS;fBZ{plrhWzUhLY`;3oU0|`tu;Izl@+dQYm8G+G)}ohdf&%o&E!UfxDqV
z9W@5bSU*(BPESO@DR#Ln(&QI@&>BV&R<=Kn&W*Bovu}ww?P0Pl5g<=Ufjc(FTa`;_
zYI^anq4l?my#yl)VoPQ~&}ILL;7qOtX?c?Gh?>)4Vy|5E<_OK;F8mC?!lR_89rZ>{
z*c4b8-P|xOR^So*2LFZ|VZhr;Lyqe!>cyxf?i9=LOvHtM#ro4Lr2fK59RSaH#pzuW
zs@AXYJ3NUrcwIf#(<%TA%wtFF))MXOPjEXNSX*`BQN!N}jgP!_Tk~IO|No5jr*~)=
zf@E%~=Qw)k?sP}Qr5p0^s?abY{3W*-eY;4v;FY)!Z@T=Wu39hVn#*>N>IzZnj`aUI
z_6v34eT=DhpyOF~+2AFu|5tbz@~;VmbkeGW-bIogUKJeAug*HyQHf(zeb04s;I(4l
z{6gHJ1^w@9MqpfH)R@r>Oo|ll-++I^uR(j=)1d6Q-I~1;6pbcLcrN|95e~GioXezo
zVWN7GTO^wF_XS)P?e>e%pRQUfCS}6P7D`7txh>xN-{99kGz<BS*Z^{*5dEJu>HjYd
zr_|Pf$E56UZqGy#1YO^5!SAx4gRqmMFlEi`CG+8Bi(X%u{y%_DIgTiLhlWM4Re8Xa
zHh3!9(5pU&JTS)Q5#1oXaI>x?`64`o-zB|Bdu=2?hVlnlv6m&h$G_p9a3dxeJC-LX
z%h0UrU}tiCi$yT17ry?sY6vl~R#C3Ir+-O-=XLmLEA3~~Qv7A>JBwUKx_-Qadt!dQ
z6$Ydu`PHlw)AtZv+mihM3BNh*=fG+>HH|z_Ou#1V^F*|%zc}eV26tbV4iDtR!g{R=
z59?3(#c5;1#MG>raUk7ZH|&TPA|3x@h_<vPcRgyq-?bZDTg<c9<^Ass$v-l0F^$qM
zc4el^`hD@<Z_2-Gsm_|Rm|jptM<BfCuS0lhZEbke;fAs(JweLdl=s|5|I3<VshACm
z>q{3F;XeF4wEnmz)y@0+m1m&9^|@$+{~Xevrlb}9kv`AbH$c%F@GJc4r0YZaGiIsp
zD7_4X>URr%grA&rKJ>pVN8znjag|t={Qs%wqac5@|K7|S=AbD|)$1a?zdxiuJ%zg?
z#r0>(MXthIxC_4yIiRkczp6E&#mYNM^F7;1{%vhxLjerdje;_t?-V@@>rczvWnQc)
zXA)3Y|D^vn)A~08MbDdh$dOs#_ej?N_aVxAysB?af06fI)wS^f{2|ig1%G~^Qna^x
z_l!n5-F%Yv{8QoI=4VYsooLU9CXu!g&L-b`bt@WZTi`!UyR&t~gr1U|k9B&J_AmMK
zeLdW=B}Bb&NpP-d_kM)yL(IFYmhO#eT2HPk1-|!<bo`<3eyqP`Y22RD&1qc=@ECp*
zY4)A1XsT=GMoX50>+1POB5nU=EARTMQsfc44V1!I(bW7S_#OVGyIW!XT`K}tv~7{k
zZo_N11HUVFxiy67JSf&7TXwj6Dl$FQ-SbDdx>f$`I!4(R)3>a&#qaW-1@D87xW9k2
z>a0ospPKyc4CznkzqfPs?BbNXM_T{C;o8>qhtBn-|4nVmn$m=Sz&{oHZKFRC#ziYb
zTP4PD{XYo}^WD(;*Q^x;OPh3+HhT^?;V0?;yP@^BiUZ7ke$x3%>;F&qc`KS{*Cr36
zSX#1+%X@eZ*Wll}eir19-anf6ykfMfr2lR4-d`QEKkLR=A3g9b*`=@_R~0>MBmZ?9
zqVfRRWA|I|GyJl3{R90})Jme1-jVly0e577Hv{0vH}FiC#Fv64nAT{|vXZRV;U+xU
z7EARtT!nRbp|SqBj;yIr_)w7CZ?-<a4tI^B`J5)3BX#~laLzaV`VpV!3bS4d&2rL^
zf1beuxGg?`?$+nu3n1?`Jk?}+9D{U#@4_v(zb*6dD%@0<dj?Ku))`ejpTk4(Zok<2
z{1e4^PvERbuj~5p7w`z~!kwK+vlrpE@;uK4DHk=Rb$A3<;TqiE7SFsZaIzM>gcA~B
zRhg3~a1(B*095!4<Wp0U@Qe)xw*}ku81BHWo%jfr;SQ8w(eTtq1%8zBtT*8vJlPh{
ze+};Frf?kQVP41>Em)NP-4Z3iHu^JAiRL^U6|6){Rk}xT3vTU1n!YH2&eW4|M#va_
z*`VZruEYJUSl|d;uPsrZG+2VTzTJi!aAz0!uPB*(SksZ{P(6n0aD7`Wb6^2(>ZhK;
zQ30YNp*)1^!t>uo|C{gt`bv_VFzn1IJ=}q7aBC=fNS|lYpJ#B)2BQaB|0B2w*WuB&
z>tBZ3LOwnx;N9_J`r@Yae;fTt`rlKP@T54cP@C6lxF;;-ZPx!v5ktCZ928-8*pJx1
zN89Spd&&PDTo9yUR9{n_>jwM@ced4^mfT9}(K&CJwoN&}dvFtOyZtN;ea!?M5_0X7
z!PmWnXYec)4%qoK3bSxnq~9qYw+ip!6}*h*p`AV%9DpO{fI<MTRO?#U<@s^J8pLFG
zR|Yh7F1`26!`4q-zkgH?Xkdk}7he0?_EXlF_F<t5p{BRj;^kS|<@s^R|FBTO!T`M#
z0NQTyAAvc8qxS6mYj_4Pb|)n&!<?XYW`$hb)c!q%7rU>2Qn1AbjZtk$!b{B#yUBkH
z4#Gi^-j6Buo)-VyI%T&1<VN~E3zJaR4d#uuV2}0BYyC<8TI$h>{ohUg0rPQK;3y~p
zkLP#O|6J$iFdWePca?ol@?YNN`AN~4&8nU?Pz3r?wa4x*&yT`kLncWXnTX1s?MaGO
z*7^@hf2vkEPv>`Wm**#7&KTiFWK!RY7HE(4hy3RSPBD-JcnMF{?6u=36U@xKz^4b|
z19>kI?zaAE8LwHxRchK1*kk=8Mf+P-E6I{N-Jb8h|0Th^9Mi0Pt{cE>S&{cc4{Y-t
zBO0`f*3VV-Bkt=Hv$ZK1-O@#PeV@;#nrt-Vb(Bb2+1+}?9*zlIqM{VQax`u3=To;#
zXp|H_VEtvavm^u1{LuFwGeq02C8B!!e7<3`tpi<0*Z2SX6(!?ix@B$u@~5aj15I6g
z!20X<3MFm++Wu)>hnhJ%mP_3GQ_{<in*MA~P3&|1BQdLO>u>J!`HF-eHPQ7Cl7Aq5
z)N})hM)!SvMhy-b*{}~gzoS-93=yvGXZ>z}V*P7M6YW?2V`hKaQo*%-J|E{do^LA0
zytfG{oAJvmk@f-d4{)5S<iD;h+}Hl+YN%-F{I2fwd}$-guwT<Sv{Pkm@P3cR&@!V@
zVPEgHRnjrEvR~6O^n68`+Wl_L|DV^NBCO8Gtv^cqy^mhMw@7@<kH>J@$E=@Xzwift
z0>7g3anF~{z<tc~xjL45Dj$59{A$+k`7!dV*!n-_{AGrjuwT<UB)^nt?)_Kn2Op<@
z*jQTsBTQIkGnpT0|0>17|BsPhtp8(FFeVhbAMyL8jp!)nON}Kh?a4SD>z**mwp=NO
zG@P-wlO0pDTSL7z>!PPv-p}vXh3Yq|=Q|4W7xxonmDMsgVE{18nx@@*{iA~Zh<?L$
ziEv-*uLz55(ozJABH7s65|k94MAu?htb_ZK|F}VygaWO=`#l-!L;h33QY#lH<Cgcc
z{<3a3BgUE1(JQR%`TU6bB}ddRw<i7B*CnrJ`<HI%>w0)!L1tOaf|JHgv@DXI)jgjd
z)$`-lV)p_2pTo4lpY?4tbzwjIpXvNeD#g^4^6X3hN37X6y6l$s!&o2dA2*!cl5QAj
z3-`Wc<U*y5mi2aVxN&d!&%lf!=%qe|q`!N+@lgqHLh@fV`Rz-8CWIwdHx+)T;jBGA
z<LVcU8%|8NE$&Bu8tPY{hMb)LgXSXBFvuP`x^XxHr>w}A*07>st_SdPr)AXOkf2{@
z<O2p)Bz*$UcgfL~L``=LjwnagGtSI3U-f#IZJdCkVn;lvmcf>$fgkTqT3^xnQ-W<u
zDVh~cGd{I}#O*(GDlp6|N1CZd@kBY^g<YOcIoc!2amEr-k@K-g)3=v@T{yhQG;|XO
z_$@pXWBragSQ75?2^Fa`dC$l(Ki`Ek9p%`M2>pA+dTrim9_ZDc@;@&0;#|N2Z<M3n
zO@B+mvOlIA>!`*n-YUoXY>4b}C;gchgk{BwY>y=W?Jr~%(J34k8gZQdbp9U0)BWU_
zPpe2gVl}GNWbkMg(s4v5aZG^XQ=(mdr#^!{^yi?Uzh_h<s6rYhy|4Mc-E!2^(%-0&
zRFva>p?S<#yO^;_aYr080QRVy-<SSu|9lw^!bvr$jOzx!tVzg6J|DXElfhv@PaoCq
zcU2*NrX#$Y{TUav>=DDW?Wy_Wq2{@^cOx}845#6^3LNPYxvnOQyYP4y`IiL!pT6P%
zRLJ>XB*2Go6K=shc)qO;O~QHj5<Y=rn!3#uM8}B2Rd~Gh9U5>7K7-2|8Eq&6Skvjh
zqbmD0HY9^%@CkgXE~N>>hiR%&<SN{Z5uc&YES!VS;S!vH12&@>&;Jg8ZcDn{fD>?8
zoUik8fE~4dq$>YkaAT+S&%*`yL|rDcVkAh~{{Ze=m&!IeG!19qvPjcsHTO2KHo>cK
zecMK@x=5!#6*qIukfPFj+F$So+}*azpe*@+ru9##?_^0n;E@`Io^8ASIk*U)!Z{i6
znl(!%8*&3~?X>=r`u(#ia_4fu@6_dReWxyyG8~1=@QGL(CspHUS(n6LaDCe@v$XyT
z@EKeXdC$0toQu|FaSI-9b)$8WmVaiP*OC8PQk(IgaAVspxuid5;SzizK9!no0E^oH
zq`!~1J-<`ZpL447PN`RAQPZ%0X#a-gKMJSdQ@AW{;1PokdaV|x)Ft+M+w~uYb8sF`
z$pnn4%#wcpuD;^oPUrU`d?C{LN#)4fYV$p&{ols^RNy#VhR;R1o*vFMpZU<bY_>VS
zWjFz!!WG${18PxQ*6*h-p(MZFw(CDD{h7D9y}nwH?!a}px^1IYI=@%+`v>d>-PVNF
zpYYo*_U8y(f-m5*rtVARoYgJxl!B9K_zfNpo#8P!1K%3)VGKN{&gnHZ4yIFm-}kTr
zC*T|S9xjNsG`g+Rl+!_lr31Ou!sm!c&#$P#vToVq4lK&8rm*h8&>l>|MfhICce93}
zT+taktV{hM#*AJ1U&FU>$*8}g$g`@(&oKb5ZH?pS;S=~0t{C-D9N@!>EMo~zhMu1k
zao;y$evOHrbOsIyV?MoCVbCS*{%7zFe4?52Txr<UsC7_BvbuHsB{&LSN&Y8b+UQ+Z
z)G&QP%(?xootD#ZS;U*C?euo!zz+%|K2@#?`cpRj`BZFlReA50Vu({PBa`%I>-xWh
zzr$x@^Q!6+l<L&4VSeLO(BIl=ISm($_;k_;LX!`gg@bTNfyGS^C2Me6`txaGa#8-R
zEE{xMy_Y5UYsd#D@B0~iA?trcOYN!#cTy~0sqDXv{#+Ds;xXgbtEo;nug1e<)eA9D
z1J1%X@Rc?>@~7#v98v4r0jR^Ztv=u|d<tJF`kFKLyVQF*D-}qG;^B}Fn-X#2w{Tt}
zM9MP-GrE)2b^bTcZw@Em3j7VOz;Wds1`2~(qBKjrzgLHBXi2o!Nq>&1M>|*k>9Dmv
zrr#g7pR=Ov{!*|z4Ko2v-Hc|17<6~*`oGfp=jA#xgLsO*sSK_T=}#v8`9}6nfGhb%
z8hJ!SSIIVaL>s%Qz$f8LDew{PTcF%j^)h8^TG))E4#GwF5<VB=Mj(EmQb^fSqN{M%
z6>u8P!PoE&oRRd(<^XzzeKt_j!RD+?x_*2i-tt+sI8;>1>lZtQ(g4k)c>WMvfxp8S
zBA$yeyBIbdv97%q+;9UhtJC+n-9Iaac3FXOn1yL|PB-0#HQ*$CA>Q|+svdL&q);)q
z--cRW?)vYahD+kYoGKD$4s@5V$tpMDx(6Y1@G1Nqz8C3yL$13bk6RWJNlVb23q$h%
zMh9rlwt8KO#D>Z%BRUd~UH+HhJNOEY%XCx>ZfR19-G(wsO%K8k!YA-I_#2#(mmU}|
z#HgXcx79JUnTr{L(<0rzEbmvfk=?RtQwId5v<A1rh8=}3;5+ynj>&YFjOYqw#RJIX
zy658$iFWx5h2K$tDeL@>Sw?hyNPp^ZTG87DQEyds2`{Ur-xaP)%BwsK>vvS7?VpH>
zu5J)P1D&OYu_3Oz{HNh8d<kE}8F44p&48iYcrr}ux)T=Ue+9nK^=VRfmo@0AX=uzU
z>znHj@8jOvhuxFMAGfUjy83oD&(9_4-zlR`46`+2tzUthQ&|5g>CXk-9dh~5t_|@A
z0znJz`}#jI+T##j*uPx)nHb%?=IcKw+U2hlJ(e~`(ejN!i)4llYV_!hNXJ`Y04~Ef
z7CyDDc_u(>CUs4HDYVEn#Cdw&0^h8NvNKR+H0lZOh1{^=eNMn9ioGWct0Y<|G9z@1
zv4wZxL{swT6UF{M>K=_KDH=%E{HDs9i)sBQ;dA&LPAl^ilg0xna200ZyiDIh7~ovA
zt)IhLWxuQHA1Y~ZbVOdMZRn^iSAehK6AKgjniMUm@e4(8P2qJdCha>87Zo^88aNJy
z+tWxSv(Qw-{Ns@Sarj)PZ%T?9r>CZW9}!gI8v#si1n~#pf-c{OtUGpKa6=6N5#;Ke
zPh~a6boxKD%wyROV7TG*Qmg9uUy1b}Rrq`grscXKfa%MDA5c2#9lQ&;<B<OqIIp{J
z$-2Nw@*WLkb=Nd@@(yKQzJO1Kw-&;y8HI4Fu=IsF@E-1k^*EvRpVR&gwE1B@Q~TjD
z_*`x4oBA`O<ZZfsVCc_*dC-7ap>p*FIJpE&O8%d~abaBz<Qvc~)?pfst7rXX$iJfG
z+oyK-t{L0RKwwtMfvoHK1-CyJ;d3}C30EY=OlK^b0tYHfr^-6xmOMEnu)56NJdSCk
zqn^XdFrXEfhf8qT>`z5s(X~W)O#x`C3T*bIg#9^Tw7<EvX%7?w9<$l|g%ro9yZ=}4
z8621KD4XjXNKdEWg3|HtLVrU4zlMvJIn1C(ng6FSn5%ZMQ%(0(l0hBL!4C@mCY5(?
z7qu_55%#X|50=vDxdLCoS;bCndP_P?0V9$k`1B6lwEHRi`4nc0C%$AoXk}{*Ol6(d
zDTr;r8TcALhZ9nu+7=G4o-9G~Bxw)dC-?b<l2_A;k$XBr7;s+nmaWPFchUx4fUn__
zg-@BTSz(4U<MV04+ZY-eflKgr_y$fYeCrvjO0IyksuguKx3U1t>+W_@rHh&cKH)>k
zx(5uDQr?jN0r(vL0aswo(D^BL@<k+&Y5kk>KL_8#H*mu4E*%5i&5O*lT}{6&pybUD
zaLJ~#yIX9~z#8ioHFVWQ$*1q&GR(=vwk&!n**^n?l?#EmJqni;J{^&j$tA@QVP9`i
zRt)tb0bU-5@8EmgU8_dYQ?X9Ko}uP1!^_a0S@;~jhl?;P{VN+%84CPbaz<Ub;T4oW
z`x?H3Gb+3f3_2^oPbxYZT@5rZhW>m5XJN`Rzhz@zD=7=uS4uk#Eq0Ul`2@a$PYiH0
zHv<@L+|gLM=Jsa<&cSzZ1rBRSBqk)92KyT>HIoT=8|(ibE^ib*H-nzA{^8HFB4q!D
z{(lDF!ez-nFTVW1j!;#I?XP0!wWj3x=URW&PEp@*!!pA{i6wAC`g30PCo3M_zOI6i
zgjjX`nNj%vh3@EVrq?nt3&{rMnx;J*q{Aa{Se*}JW@~D}%Vw-z1NpI&FbCxv4!{YK
zi03+bWkY?dtFdt$=HLvR(~kj6sB>Xf`Qe^Xb(GC|)J3>?LbLp&1AIu>24k`^L)O2f
zmmgK9NP50fh65Tct_iE8XBt;E1h29f>(k#)F*##!K!J1FG^S^ml~^Jv<h9NEYx4I!
zvv`rruL;pGl@w+mufY+xtng)04#=D62WGk}dVVfQ+b7_#(pH&qBy<h2s4Xy_3e3Vu
z-8DAee_m}XH3c|riM?WcdsQVOkE=}~uSg{V8>6Jc`+5=mCP}fOPKV9)k1M$xrmt(b
zQ{AE>)nP`RQd6P-v+6t=F`k{=IGoxNN+0Slt4^6p1rDffZQAAwD+YGh6@XL^axsn{
z2>lt?Xl%re6%l0A%<m_SsA~NuVJ0c~s76B%szg+`4vaYenX*{ZFsmeexvc5pSsniP
z{miUqrn6L4r_!wJ&%E09Y6eh-Js&2uW+e3!uK&jryJiMA)|c8wB4kA8H)YCd6*vf|
z)U{A4qU;Qeq$i#~WX6u<08Ybcn6b&ebwkgJQJRwQu;<-=HZ+}oNbnrF1;Ck!vny!V
zgF3&pn(V>6Qmq5~!MF$JvdE+AlspvrGo|U|DMb(6O~R&pGmY=p*-0lI9+g6j%JF16
zy4ba=BQ&xcCe&wFud_O(0&nE^GJ7(Duw1K{Q9+{?dop38eLXw$6_aFHK}4)~WBm;k
zHzrJ1BHp7Y$;?Q@C!kiRGc8z?zK(d!JVul<dpgfGn?v20OP;oytKT^X#<o({zc*CV
zN_B!xO{0ZnixhjZm?f<|gw+=uLRx=!PJL_Bc5}tPB`bz9H8nan;C)#P*b}-TceapB
zw1op!RwCqp3|I#JhJeCrx;qVuZZNahCoN<~$@F@Il>=(JbnD@PW6u;Rvqt9;cvpsT
zm?~Dy;-?~)RJL528>|X+s)iL^HQMR;iZCfCG9J-S)@pRdRGqKe8+r`T*CjmM;CRfT
z=BkWD^nmG%CsIQxyDF%rBh(xdyg(E#b9G<#bs{2*n<<PKPX@on`ndMLs^6^Yr+QMy
zvN8xM0GxLH8P{f3ZDusy1(STH(={oPm7zDH^E^^?ngct9{_kkkXdEhaS~F@o^`jc?
z53wCR$ubq=4kQOwgBf9iM2WAf=&L3xSh7P=mk36~{!D6iVWLP-P_iZTgsF*CcpR!#
zT9b+%k?{OCvEv-~(6K;*s=$<tv1iECV3BX$JWFOq8Wt$_8Y)ng4Pd;j{9Hu~>zO1o
zMHzW+Fym2q&&=-hi{vjJ)8`n(X>N~2_B{-ExtNdlbV9GL$)mDvi)B-{j(t~n0^&MX
zDKajXEbok&&azaxV^LBb4rxZ=U&Wq`TVPi7=>t31<pkMXF%y*)-Br|aGi^}RR2z_&
zbwQ~c%Iu(+#vYheuINIxk*dyzNUS9@&zua3RxKB~#D>qR8A^7~E<qigq>8LpSdzN*
zrBamvtVz6GDQV<i;*COVvHu(EAD8uxYfj&MSZ4F!J*X5X4L3-`W6!Ze>!u?&PePcq
zveMmUvt9#pV}mV0WYvKrDvBn`T4BX%uYDtKD9b{&jnksc(0&&4AEjbQf^2AAF-Nt+
z`j~`~Tdd(%sgkZW=>10m7(6$c1PtZ5WDc;aoPZyVD2eszstyOXNx7)@HH)4w6OyM4
z@T#)w004jhNkl<ZM9oUpZNYGcEu(0N2HJvpIN+MmJosN4${b?Z>6+CkdsPEwsOtQ5
zbX3rx>=dP&4q{$5dY@h~P%NAB_H?sGZ6V2o646UdFPvEb(IwxIM3{m2vM{oRdP%0`
zwk;IQC7xW@$#|c_C8w<89tpsRfWj(xZhP}H<SadD8Y}W##QL_R<8fm9MV^LOBU*OL
z*a$ZiGI~BLX0!(ugqF;2d04&i-J1u}JYLysaK~PxHN@{1ezT&c!GT%g=%VuDy|Ie$
z)lL2Ff&Mp6Vn@lls`j$%k}q4Z6tQ#9-Z|8`EIo_VRK}JQU(&6iqi5m_b<CgUW^gy(
zHsXtjzz24D_OP<B;hPTKG$MXB+(=syf<L&anDI|pN`k#euE@A!A!@<OcNOgSbsV~8
zZZeBZHxrau5%QpCvW!>gi-kHYe8;hkyOy(9Q><4q*O6<%eY>1*=zC`EJwwWyna2wS
zt=bVNqcyj-kQJ89dqqrM+Pd?8_$zWFb#?1>%pdeO!&<ZFeOZN$ekb~zy4u2eu_vN$
zmUwgj(zldtA#u)o_WnceVN;oNGogL0sAeng%fmL!iVg;5k9+nlW%HOW$nb$1`de5@
z@%m3n0aX+)<#slg#ktu*lk!V))49xe&roP8))?3m1mP-lBELDX!(A?JaVQ3QjiwCN
zCZ|}}ilvzyhRp7wZ8e~-3ZlM?8UmN0s~SLVLc<C#v8c9?0X9k7+xo%5c3Kij64s&}
z>u<)F8PQnRU4&K<pAbPe4scg?FH&`JyyEDiZ*PSfwFSq&Xz-4?MH7Af?ab=iYk^(W
zwYI{pao)5nhaY#6sDQQIfwe5^Dq9UB97WXvkd7*QYw4}qY76d|>4^$#*rUusf{vEl
zM&*Jv1DNky($}Sk1l-Y<tp(%LnmSf1mT=1DdcyVP`t#VrZqKamy6zs{KNgpjvfi;L
zm)u7G(KZt~uzyOjWUiXg!j_~~KzgVd=TT<9Iow@zLnIaMb&aupAR`enk1!5PtalC5
ze-ox;MQ>ZNt7w#;%f*hQo|(Z-q>hf@KPqNxLoOYoVD3t(dmX`aEGdN@UZyQ{y0|z-
zJkVB2b3^~vgx5bJ5k}lIFq;>~qiy5_n_t#7=McB<h}+82t|YSMw0|8tj9&F0*g^7J
zLs?ST(4Sbp+mDXEA|u;G&Wj9XSF>%)X{(pT!Mb3PM%|*8xB@pU)3cn~(3<L}ShYTh
zyy%n4<`w#C!AK5h0}A@R+9EsVN&4m)yShkr3|b|%2yc>huPDr|S`bw(E+>7uGCS>-
z`f1)8Y4uPY$<0Vx$OP|W{mbenYMPhnTLrUaP?wpV;kC5Jb)DX>tuWVCtqEtcZPxR(
zc($7@A$=^#hDWvz=}=Sl9K&_lkbhV2v!E4vhAuZV5^0_lwSX+g0bbO9#AT~$!J=ED
zu8<ivMAj~nmgg2t#U}#KzAz<TMhj10`miE7FB@}7*(}k(dJEGMmLdRJ)c3{pDJ~1q
zIpZNsTlll>me#*eJOYDa;9IgtCR%{j5R~#;;T*+%H=-gGug6ugDP)VKZ~%)s&ubA`
z*${+`!REJz&20zFMZ(zV7K6x&QAYOU8(KPl>nId_r?%Rm<gMpSqM+W9&q#lNqoppj
zgf(T~Jo<{bp=ShK%W-~}^zxnJPkH{PYtE=GT%~pNGH-;FRVj)(OfQz&X$xIwqm4MV
z6~ECET=Tl71-W^ZHN|YrmMm#UX18L#x?>F)C7pt{**}`_P7TR^yVdnC8=h$C2!8ub
z^zp933%D;H>-AzCC9*qZyT~s|X1$JJ5?+fmzG1c^5)omKQWNlF@55abPD*Mk>=%QT
zo*m$l4$un+6<QTe&MUdXzJ)wPyVsJ!zE0Ry9g$%?F|4ad*JgGe`m&bK-~l`j`7N3Q
z>R605C?2;}o$4hKCoCgvcnMF1#oo93Qb(r0DKow<@|p+m8ah2Sl)W+Dx1l<CDbkfX
zZO<I|eNiAeix!8ME%wLK6ltXPiPN^Mz;lskx2#e;s;1M37rXKvkKhr!g-)7hdIHZZ
z0O(kN*p~xHJ<Mxj?s*pavn(w3w+8p-sfK|ajDcR^MXY~KB6%nI``xi@m-?pe4sT?z
zx=H^Z3uLBmDe;Q61@z2xJabT{f!NfZDS#<k;k~7MRKs8q-U<|JA+6tiV#}^yVSxQ0
z^kjXWC+>v~yoS4?5m_!yzzp3+N`h5*>T>`ql3yClXj^X6JAz7*;<9?Um*e@D0ts4D
ztQ5^_v5RF{&t>6qEhPP4k~bfy9@3E098vWm{6SZ^O0Qy&^Sy`!T9Q!9*aJ(&n{8Xj
zF3&>#VSirhQWny}o=5xAQ4jQc==Ed|-a*UK@mM7`5`G;i>mzs>gPiZ-Av`qs$4x1x
z%>QP0t`$+n{|bLN>33EH0mP#Is>SqO(U$%SKgTfdKs*o8NmeRS-bGW$Iy@Kg;Wg*{
zt*NnSrq~PM>E-kmOCp~9D~KO^s_oARK`;u}ebd{vfXyDl@9<mD=61!HKCNDuuHC3I
zOFpfLw)AIbz3Uoj`k1A9GK=@uwWSN9?fgB~-?iR^DQR!1m_<Q4uqx)`--9@BAOyn+
z8H%b!@Fn|-HPIgb0@q`bG`;^6%&4&^@)rZ^eJ{!S+=buZk01{1s4hIMN%@M|&8AtU
zrf7SAhhKwuv2Uf&x>AsR3mu{zstGUQD*Wi2(Wx;jO8Fzj`|iLzak_gx5b5_L*Z(ms
zK7BlV_9Sy!!k;1@Y{u_TsS&Pb7x~Q68S5hBx;C`_G@~;CGd9E8u^LLb82jIWpG6zp
zM=o+y2#}-2s;S%Xwzxz8@bz~k|0(qXc^%lTfAV%W;Yau@HL?5i6V`?uWx&4qqi%8k
zA4L`^7wP$^Ws!#Ju$jEwW7(fO5yOm$XJ}H1i!i-jvFpJ*xDNk<KLgF8EBiAol!Vn{
zD*_fkCj0ZVXrnjurwn7pkTVo|)hzp;qTQ?Td+7hb4$pWo`PsIRv?)}<n_@kE<N7lu
z_}8fV=g9&5zmze1FFdQ?;3v50qwpP7+eeM`eP9uA-yn&85pD0rD0Evd+Axxqbv2Df
zIwc*Qzl?NyBjwST4HzqeKtvD`rHiE^?eKD}ziVNl53!?@5ZBJ<TEewZfVMTKP1zV`
zC|yTslE-ijes=4Jyj-NaD@6kkmb?o~a2I}opPh6aRnR<qDVYzh=>1pV1zdw)oHn*;
zRx+k&s@B?)Vy{)vw*Cyi!=o6Z?iFot#qw8C?2XLNU0Kgt4h0b^P&W**Fr5R{Eqwp3
z1r$kBl8pVUSmVJ!`=9jZCj11yMf8HT9>y_<s1LhHr`Rd<|Hjt!mt`>`)#;^+j_wXu
z#asR?Jsg$B3H8!={wiKR1+F)wKTjf4*OL!wD1hqg@OUXa4WHeGpW!FC>rf2dTsWzg
z!hZ3ewwRCqkO6-lFFz<Yr$+2Luoy0RpW7mRe-Qc;<+`fc!$vIzK<}BN#{e`z%f1Jq
zJ-*>^HXSSdG>QloncW@M;2Hc5|AxPY^goZrIP+L>1MF)6{|@{JKf?W3e>51>lt7O@
z1mFL4TmO$iy6*O~s+Gsf`?tj1{VMznS400>w*C<-*G5Sc9iGtNUtRt^Wgg>s>pdy+
zf0N^RCerg?wvrxqrE%V&;Hmdj0eb!k|0(y~4;5dxYuiBL(k#M8HRdH;gMY%m;r0+H
zYs~z3+lJ@;*WoSP)9!Dx{z!dB$uSZkUBL&00snmnP;p%^A5nAot**}fJ(14;4)=%F
zzh*bzta$AnD3kw2r15F}FSf2fM9|mkc)jjDJc2*rU+|j<P7A*ub-c*icEt7cUYx(b
z!mmT#)8!u?uVVo~S8%pB;8*xb^51Nv=vYzMQ|}#1iY@8xm8{QGm;ajSU)7)h21?td
zyVrG*e%~2F<6P74c=<S*7*@slcprX)pJn|9F8_5!0(Du!7%FN>giqlb{7VLKvt|=M
ztYp6&zrSXjwZDjTer-tpG4>S6zrJ-chxPpl{^g|KVS{5vC+f{H2-gy#@EytjH+by&
zvtI0-cg=07NcU9U_up`92-G@?$QAXMM>|$DUo9$j{!i`yu>OP_@G@d_{=FA(eCYqC
z{;Vix*faal*ST6%+U_p=2>%@N{vrRU_g4+Yr>P>-8|lxF@aInUC#tq_ed_7uQ|9xU
z1z-jH)3%gH6cBorvwH@2;ZL34#i8{_$vLyTSBrcDb#4J3i1hs~ym0BK&jQ@im%R}n
zXG1x-HNp9&%ge(dNc2^>r5OFW5H0F<Ay|Ni@YlAh2L|v2GLg<-FyPIBPWTJB1AoD_
zA?+Djf4Zk0){Q9ehn|Ua{_58Cci@5k-V5QOMIV0)UcxP5(LUM=$vsd+nvx9X1SdNw
z<eoS1NO0IUw}n%F57(4pJq2?b^~$v=ui*~-0oS&|;kIOu)>M-`s=2(jyx$|Z1y{F)
zgYF9%FO_JH3Ab)skX&!!K3s=uaC=)g@wEPyQmiLzFfj@E4P1v`;ksX+;{SfEe@S`D
zBf=zU%YZ+DtMCWh+!oIInI#F&8lWXQK(FAg$$xnLi*U<~_KN1}N|rQ#C@pv}#7xwJ
z`_NZI#R-iTR#hc^AvpHyaDQ9<c`A$Z9!@JAGOGP=>hAX&Tn!X0|1806!2~?j?~kc(
zcU3B!-hWtsI@<qqO*&f~3||rW)_u4Ex3`6Zeq~LQiyD+}DEXE0{wV+%_Wp5xZ`)1t
ztY9UgaUjnBpWB|_`__<g+Ca&AYPom`_XLOiYODSy{cEc|Y}L?MTgubl7aa7{oz8Dl
zs&b;phwxqj=nc5$*Kf4F_tntw2F?q%Ez<7K#D{Pj?r((?PwOv13*Ny+;YpS#@&D1P
zp1?z|U2OZA6m0Pkr8JWcFX{lKuUOyq;hM&*k18KOs+Pj!{_eq(JqZnt!YP6JWadF%
zijd)z$1Ypfe@q5!M)}Zz@*fXXD{gN4a0L$O$7c<YuPNxXCtkbW_A@O)h(k(AR&=VK
z!Xwps2Rl7KA()PXa43NdDE0Uf9*NLq`}Z$v{ig-~n@f9NONXBCLTHo0Avh{RmvBHW
z6*?cN*1O~SM`2!s7zZpL+g1dAZx=$7vIsFwONA=M^*`|1^S1hPP%s|{jnQk>NW-7(
zzWy;d4(Ei39fjwY>Xb|R*V*>riZBh1!(o*a`Z~amMaZ<i(~oCHh;zUgvASw}xDOBE
z)$aNe??0uCQd@@ck!r2oZ6D6z5FCe73ZTj|K+na-^Ky4Wm^nCMoJ3=?$_t{zc(nWa
z$3<FyO1M@f^|`(jA=UGp<R9xlAu^3fM=yvI=IOT4L+IZj!E79|+4xm;3O?JD5N1?g
zaa8Z0tMv66o)^o_!=IW6ZB7c-Ca&KvB>z{t$UlQQ5$eou{CJ5UtZ!>+r>!3n<Ae%A
zqm}?(Q1{>-pE0B4t0{tBQr}Z=&*$r!^dGhA!>VXWntMK<8|iw($}da%q+}Zlm<Wp?
zR@Caa_opIOw+X#M{P^O2DCJ5*PMQ>UW?Qr@i~FR+oAyCh32$kil$H%4=T2z$qh;8O
z>w7+5G8KqoOk0VBy+t^#YUWsR{R{il-$0%kQ9Y}vR=eeWQu@s(0<XyjEvi*_@An@S
z(#E*L@8}cVm;Q`Mc%#<0z3Q!td#-<6M<Bg_M>oUeeNv)MNq@>RfD8NN^Tdj2lfuqi
z)%jW4&-pE>AU>t@(-NQLzK-CiTz_4MKV`9ER%mbePZ}~v|3mesq(Sp=;BEc*@_x?m
zn5w}elK+~{@5(N&Cq7;Ys#4mhYw!KuTR-I@ukzled+3u}De!}TzhvFG-H-V3Sb`5_
zeDLqb9K?q`?DE5KwLe${RRsYb{Ji&{v_EeBd9nZKV?HrU)Bn&V_~60EC|`(xkJg{)
zB<+6$sd~wZQ6IVfyqFCB;7_RV$2?yu9{!KfpJ)a6`1g-4yw*o7A$njx^8AMV>ik#q
zFM5DK=J(6iknwTrFKzwvl3|%;R<m8(|1w7v9_3nKOEbFrUD~J;WJt%>)kwX^rz9v9
z5AWAwE^<GghLM){^!)mM!jtqVE4-{(mTKK5$@c#JhC!cv*uyADuiJ1`N?h(Oyz~k9
zxs)UC*fjILUVc=<%k}cB8o}7tr!M)0gNst#zCM{$U_@QJF&)15NeTULsLtEe!~0!-
z!#YlU{d<Sk=~F3os0X~s-r;?$e@r)ozLbA$|8A;oy*8Q7|HrJqt{`=NpW$u#jM|S6
z)Zg;~`Wxpz*^e+D`;s5(S`HQNs?NVJ{Xyq1(jy_f53rwO%Fm%2&dP_bKdS9rF=VZ6
zUH`BHn1n-cSP80#uoji(e6{;AXb6h<pmNN8N$)K@h3C7gQ4R-$K0K|V;;x>586+D!
zeWu`$7}F<hxamE-P=b9oH5!G3LUTPJeyb%_`Cq`B-M66(vy$JWW)?bXv`X`7?{|5A
z91g0&JF1qorgD^Nz;-t`&=7R-5n;D?)Z+3M-fNy@H~FVD`C+?a`~JV&o%DWMjP)~y
zjgVS2(lFNRJ@w}R%!^%b#B$szNB?YhH;{=MFo|%+(D7es{kvr;$5mrFpcF;w@qN3W
z%|l4EhopaX?SIO#KZO^&>wg*Mq<_;&(X0#I|BZM?cKl4}@ElZ;H5G{7DZ%`Dcl{p`
zlIWcFKdDf9{&@^2@1j5P{?o=%7HR5NyJu-8bp8(L@I>dtYiaRrHh5I_=a6+swAJJO
zJUA+LU;n&nBvGVIZQf7!B)uOKm~kqTHB?}Er)lO_ySsrAm5Gk3MmbP{<GIeyzSe(O
z^3P=gUZ_S;(4S2Qkik5hhl?t)H&mhPsz>iCT!%+H>A)nMfltKy?hYi?$*#j+aBrtN
z^9Y=P3vdBWi#V;PE`?X{5bnS&c)Hb$l;AL&7wP)EiX?sOar+DIz>}T!U>44bbM=JC
zBLZ0BCESLa@L;DiI0DDvf~xG3#{1RNh}?Cpe;ZeP2+qL;I4SRw%YZDvW4H~s;qgv7
zFeBo}OK?i0?S6Wm!tHGt%U%ATh;(~eL0H=VrxO0{ZAt4(BAxzJ*y(xTnW{*93vTYB
zKhtm$&cRvH`^MNow0ZuyQ<q6v|1mftde=kZ?(3<^KCS=mw(AED!CANnr}c1E^I5U}
zySo$_r{S!W_mJe5Y|x6ec--5`jnu?jepxCI4&be(sqbmbc^fl+7|y{ZI4|_?x>>K(
z;*^xHxzqZS{Ljb$MUnUw+=shx2OjLS{sx?o{Wzq#ujE6L{-hSEdt19~lK#v~|Ido^
zw=YbecT(Ql+aCT{|2g;^F2NB+FEO8XPy6?1r}fw2gwD@N$*-(Nr_?5Y74Gk(f2qUe
z44hD%ysnP1_wWpE!8LfWi~gU6%kY`(Url{$&vpLywm+BPGMq8`jJCSuQkT#@*Z(}E
ze;mGq@8DArpXK@+nMN$;)w_3ls|}ili=xfE2(y-B?kb3w(~-WtHI8q<Irs*?kOGez
z!d+itIHm4`2SaDz2wXAR-5KjKOuduyhHCp@>lvPbPv8fmEsYkX6}4)|sk*gQhekx)
zmfru6<xl!5HJ%YqLpDSjT@vf!*YGLKEA}m`Op)4LPs%Ia-75b%_(Y`B=R^jQR(?ck
z_*lJ*ceeU~QOW;1Q8$g)z(;BnJ1sKuY%A&bQIS@E4u=gMwql)*$*R=1&Uqlx>?`nz
z_|2!}150WXJ)%*{Yg=tdL-N0(0A|8a|9Wx^seSnmcr+ycWAG__A=>J3o&I&@A5N?F
z?a!@ko`{a`ThaDT8{k7l>p!8$_QqBlQW5X>_iz~w+Y-8Jbb6`|>hjR~fqAjEUV+nk
zKJqt<VpK^le}AhDnuGK38C=$@j$51dTak(+{o6+W>*BrtTC~emRr{+t|EI*0bZw|l
z9?=ee3!lPCm{RgI=8T@J_c7brCX6U?{|0{(bzjW;rbeqnx|xi^t*xDsH8>4l!dD_s
z8Z(@V7QE86ehh9n;d$7f%kYIr?+;1-U2Va8IHo_`{`_7Pb>by4kE0Qk<7&O!+x~n5
zpTbdHA393Gohq(tBL>}Y0EgiU{15mRj*B`e4sddUV?lBPZMfrmFb3z~@9=l{MAT0)
zC!TCz3_+G;*f;M|WjUZ9;1lanY#IdRtnRhBN(#?idY_0j@szAgW}_?PhQ5Lr-!~7?
zD4d0F;Rg}N)n$OYFbW-Yy^gB!^@is{$|7$2@8ZpF7-C0Baqy&#a&~<E^YA772Yh9;
zw;}&z0H<{+O@pKNJQp+p7vOL354Z>ipe*ZOm(uh^#-8p}cijf(a1y?U|9~sfo;W}Q
z0jZ8!Owo25d{C@&e}j{X-3F#VlWH9rmp#1YK6qTLk>82-b=1lrX$f9ZYN~Z=<CDR0
z_zM09d~VmjwgDy_kPR4@3T(>%Fnj@jhc86?9HaVa>bEZ}fobjEU6=o;SVzAXab->W
zS5+i5Ei8)2i*Bwzg}<LFfI6lYr@#uDw$x(72In{Bf62W6yitg>l~NuToW}?NcRUxE
zqPK75Jr0S#C+eJab-#>C9ow!yaen^+pNI;sYQ0LCCPzlCu(!GXIk*IW6Kmy^N;fe#
z*;7bYQb<w=phiU7{gr6TYel%Ns^B;dNPaE1KN%c@FW_&Y-JVhS-Pik<t${FIOE+ij
z4#H)5|8qti7abx~vLBgxUJCYe0?u0WdqO?VU7=SuG<_J8OV@|&&oQa+XBx7}HSULQ
zV`GXclKpwP;Q+?qjAGX_FlwUD%?zaU{y<%COW_2Q_I@tX@_BW4b&9BCRgqrz)i}1<
zwKNMCjQ4ol97xsLO_J%2xt)dJoK3OEH*i7P(-($FCLNxWNnRKB+M8Ja5oym^yZOb)
zW2TdssdXS}|IOILIk*hp!<W*&m^2Q|fy~016_K8=A@BJed}o}w19b^i^l(Wv;Wf1k
zz6k9-0-wT{8WE}3;AKsAXP}nRrjl~^;u19{1CqkmQ7LN7E0^^}M|CG!l!sviPQe#?
zct(L|-+Hw&$s%drdtHh*)_)i-!<TSbcbCkd8U_-_B%D^u@j{?QCjB`NUm0(B$;c7{
zHey0y)sn#eR>S+G*!8ULt{ow*WisYtvfgd|{wvr2&lJ8MHrm<DpiW2Nupz#@f#uNu
zDL4h6h_-pmKprp-ue!X?qSn8m|0m!}_!7>jdQ?}7P?+(C%mCWIjZ0hwj>8rB4ld~V
zs_8!pBPt3YQrG3C{>;Kf#hyo`<aNbenMh`$=`PC3D~PoGQ}|q@?NuECENJV)aoLX*
z_0Ddr@GxA2PvDGE0d|X^RhgW5N$-C<^ydJahp%;h$IJx81biJ1s<J)M5>{RRuV}<<
zLe;y>cCn<4c63cY5B;BqPn5fuwf2FMMS{5zN35ykdt?1Ga27s?3yQwO><uKm9!%?{
zmIc<?ECa{jJNP?Xgef`LQW5K-D<tR<8(n&n7BFey!x71+Zx|+-VMvdtb2=4tS}1(}
zMhZA>Ml%d>$;uTm(6&)fIt*Vc_J{$wjwN3*yU-+MS`+DfKY7Q?^8E+w#AJr?)Rj<D
zDA97#^9r1Yzrzo3#s&n#2FAODJ!%UebQw4b-@-rSy~=EUvMtzgHH|L4j)L~2toLU|
z8{9GgpNh4-WxCKU2;Gt#PQ!QbEu2z}j<$WML9?tMSP(iRhv6Hof5sq&+=qr6O_$3>
zA#C@O{LjOe@VUHy#lXTcwaN7jdZ;ag@O5;3`3Am*lNM$*1(#Dd4B%X&L+@h!C*XVd
zL3gKy5ta;e56ETxYC1r#;BAusI9!D9;amNFrKqi${b$<FRROfNQTY5hd?DWWibY=~
z10O9Z4s9t0T1nRTiZVxY)<RP<0}!{7s@DI~NzXrp?{$7-K(=o^()}WsPE+Tnne_i0
z{7vsap_Rv^e`Wzp*z?6G$RAYna?WVKOGU{KWQmgjTo&T`AnZ?yK9A_`vsL~B!}D1e
zLUS|8CfT3A!8zT1%0(?6$Rdu2b@gpDM9k~%cUGC>x|BCJ=<}*{G1Khyo3KBZlsr9Z
zSXCv{zfAF9LznwxKbv7cQviHg(OcZTBM;G)4=ih-{|&5W6_|t5%1`C$u#PE-p22Td
z6$GBvAU4=^>a;aYbtSQijAl(UTu0y(93LXxj<$&?oP`f*8j?>PW>hAhhEmzqAF4Mn
zpq`$cffUD_g7dm&gQkOz%lgJ~iw@(s@TR=oILs?e1)6R>t-SwGW0k9MU7(8bd?}Uf
zPbg_y+X67(vn@y+e)AgT8;$kPYc4WZ(r`^El&KQYQmb@B&z}s^`3B4diMgG%tY|~Z
znxQ%XC*d^ALRN-@a8`NmvYC#k;I-}ar8b*WzWxc5N5$Ozpa@i)ZZP9;1kQ@|d_(?+
zbo4Xp*lQZzeqU&>J&Eun%tE;i^D11<7TsmlU@LQ-{uww0=Ujg#;D~tWN9Cew)&tx(
z{LZpS?~l3ur}C|bzj5-qM(NgszOLfO(p@Q~hmPsLM{G7GQe|D!#gt1uB~!nlKSwNW
zlogXNC;-aDczzttsu;0x9*)6T6<#ZXY^z!56Nf5Ivz}3`oTu}9($c2AV(g#WZMr5X
zg44eKag%>T3+@y_>ti8Rn34QvLjDKgn7n^(7?%-v*K~NtMLM6bNJ?eN^SCm9W$9em
zlqZ(~Pi<mHbejN}f@9*LpH}IyYgk|Iz|$D#A&oL`>dz@Hsa}l1DyD_G#=z#`ShAma
z+S56emOSL_Sr(wDWmGhZm`2Gqfr_JT1&cH@3;+5uh9fX1hw&78qap#1yMtdmdd7B;
z%ZVL`Hj5g~DK)KPtbtL0=nF&)o$3jZ)Q`bJNv)UBwox)t%C;q{OOis$Ma)WMy%AwO
z*X1iRQ@RLz&=S($Px<6pgXRGtlT?Z!M@IrfAl?HR>}fR#E@Ut%V1`jSu(XQY;{Coh
zqM?*m^oee+Kl)C4=C?eHhnc~&{(7CYX>0oJ334pcC4ZpbY-n|9|I;hK%dB-V2K)QP
z<j264kVZrfD5aW1w_!ljeaU|yyVo%Uwys`h9A?rqca^m%lV8N{p@NybSJdRE)n_=K
z<Ua+6&H7*=Z%+tvJ#ArT4r>OUCHY6|?U=c{jx@h-y4=?&^?@Y+8tXGs-sqcL5q;r6
zesf^9bwuZF1OXM#$a~~PQRi|Ref@r%zgdAv4U$cpRmW|sNV3#7p_JqsCsa_a)#)B6
z8pN)Cr(!7cJtJM4F<~vG^E+qy5R+*=+1;{X7AH4yKy6CpWPc8*x}7qA_2LbpWYx>`
zFH^#1uhnRd8&Gr2N*pDNCrbt+mC1yTz+zdg>vIN9)iu<|foViI#tB^%HUgkoWuB@^
zpOlMYk0VrpF_@Rbe3ZuxIecWUpYBR^2bNALS$eq|ap8z19mDa&m>T971_CyYg6eu4
z;D*LNym_c^#kRh*Vnq4eN`>aAR5yh@jUqu`$$ZE_D<4-iWP?m&R1{~GBHMA>AQWqI
z3?+?YC!5`?=)%@kS)*=KC<Ey*T0*WfH<=X3XFf{SJk>Kz2zyqR$kUkAMyth?3UpPQ
ztK6DAi<2<s@~`P)8!6h7eMS69$zG=xlXkh`OP9>=M%pkg|B=$q6iPJ^Jmi!NSQzk%
z8IXakJl?COJ??sLG15kHT)M?bAnFO}sFd>}xOQd^pl@~~BB-*?&_r5)Rn^szV%nl)
zv_oCng1&sfj56fwHSI^kY;Ej$&nSQf79nj+s#RGZkHY}D1{T{kY1}mo;fO<1o_ixX
zKUO45E!(X#vy57%{9kU9daYV<=5Sqy%5#~%|9#tnydsq1M)ArN!B7~Yijq$iH56>h
ze_HxeHm4irGF<XNwkReIE6}T}JEN>STBW%7l3hZ|db_D)aC4e<na~k~2=7}M5~++>
zT1_TrQ~o2CdF`3P4=f|uQ&Mxx8Vb5K3&<mG>5GRWlrlVF!veTkUeYOPL0kKasbSgT
z@EEKMBeStfx!$#7jyrZQQV1o};mzE5+$*Yrj>gjHo{lJ4aFu>0%Z!aI{_l#R_Qx$c
z&QRtdO3C)OCo>+lwj(I`hElJ6vk)6vU6$W!nV~G1<L#Lr$BaZd)*lsz$oO^i{&9uu
zX_*}}37ZABNS<{p2N?D&YWlf(@=|#Gs_6=bwOuL3gkjIxR^s1`E~3g%)o+$$E9w@9
zW?mVN^jNNEy0ZSE_gxDJ2d3e~E0m0;v}zs~i(@;s9h+J8fw~NvGL&)mh>ctqsc|j~
zT?&b~RMHW0*M;Su2Kt+Q+l!1MG`@FTbTz#@Fdm8MuG+XOy8g#@^vr2S1e)t~cio?6
zmUFCFy1A!BZmwyPX7NOOk!RDa_w-D~97x%YLswCDL4U$r$LrJ-$~G+?i_OQ^AJJsp
zRJg2n#!gOO#;#)#&c<pZXV=#cX2k%Yr-#u3D^M<5ioY)X>6!jwiRAR3@i!|WydH(D
zMI?M;Q+ve&mdm}DkhQDZ)I!H9(77VEs-5D9dGoxhG3_M_qO#)o%Jl`+EoPt)sfNGn
znkDH;V=+71RaUc$C0xQM^okXbis?|a0VW^R?<v3CH%pq?7Y%*lJgx=ZPJCaui;gy`
zZ)?uBJfD)xo4f>Vv&1D!;doVGpdCs-*>x`$wyvv}3d5QyI9gSPf5Qj&<dEaeSyj9g
zE-JUnS*8;4dJqA3WV*V#@07Nj%D#Dpo^cj;yJF1lne`|u)mJal#zs1=8&#dIAuTkm
z{NK@u>FafBa)YSOkxtcSZYQ#BgQ7Tc^XGjP36>MfK}YgzZkeqrtC<jEgy|bed>b8u
zHN8*UULllks2iXo`EP7t->$i3^PXYDBL!78Be{`O8Wfiu8RIxVWeKHXK6q39ak~ls
zgf*&$YIXGOZ4n}_%0q0ZYTLTy%7)`GFzXpFyIGRy8JYD!S?Y>8;LLP6vJr91*+iOg
zqgyHRl@83740KUnar+b3tdj0)Wo=|uMDUN=O4`3}-_C|dc0-Y1{7WQJT0s}qF=eh<
zy*D<$Z(c2=xLMxxTIW#pY+&9qB-%BT)y><QR9;Z0ch{oPFiADr1Elk96h5sK-E><=
zG9r!KTxrutpF7B!tXSV>oM%r)hN#zsp1KsI?QdYMctoRrqN8h5Ny@k`lInE~Cu7~>
zmA1kp^x9z((ig3%jkS%SMxsnzQTWr50!KSX&yW`vVL3>vmtf5jz<ud=WRW~6vgYfL
z{ab{lMZjTCGM&e;?HyQzcj@=hkjh(Fv>2dbzP=|r*R$0v2kH5uddjA3co5ZX;yN9A
z+Ojsz4VZ^F;%KO<(Qr*$kXx`ELp9Nc+QoVbTg9qz?0=X9R2E$bp8G)hShqWZyX)`-
zWrdjUjpTe|!fr7!kjt~PC5CeI+7VCrVsi3LJsd7NTG(<``pU&HsHsluR$DkTamTCJ
z;=3weMhwsu`t|#?{<b0Uwk;Q2Qpa&iJDQpI3;pTF`Aw<+ajnp8aKyLn{a4bKwGAhy
zZw9n$!ER>hpb+^&=>L1Qw9F~17O`ript01<K=_%f;lNh(s=4*McqYAM&v#7a+ih*;
z60}tADx0+muO2u1Rl$0!gfN;BO`5?8lkQikFkx|N-7rRJKjO-hN&ao4q^rxiEr<T+
zP2rrpg;}{K&jrNIC9SHZCggWcx}CnQV@b!3PE2kNt*rz9%0XGy1ah)213Kgbyt2{~
z|H_j@GTstS$y+!AReA8Zyrs8~^kN&{iS&GP{cm-kt1>!RD3ul6SWi8Wi%Cpt>M2`-
zam#aLCdEjAw$;P+JmK=Ji%a0Vum^gUWb<2KRD<6OIDR>u^7rr<9%+C$F0y4i_sD>B
zMN0DA*Z&%xX!<a>G32-nb*v?HT@H9L-2mPT<mG@(FAf#a2G$YuTI*j)e`5(=!fRMl
zvw7qRGw}!}LA0biPn@`5O*VO@XgB&sU6$LaToJU&0`xk<iFymGl3y6@$k^6&uv?nI
zcol<C%j(HJuIWpU*CKW-$@{i-<Tm8L3UBrMJ&Q51yhNt&3Ip;S-b1%3n27gq&}K&>
zOpSz5M-w=&oV0x)D!->92?@>hbV!_tzToTM#vs+YdZO3VgibO4;whA#&fCjmGFm!J
z?_kafP)+k*arg%sqIo5GZR-Cs%`Xo5%BX#ZMQEA>+mQcj^>o%OT=VcBi{W~(s3mNi
z-$!sxb!i762on~%(A5OM%h3N-;dHHOg259faea-DHhNe$<i8>=ljGX|a9|~wz^<yF
zao!)nD_GAF&dC`#X_@en`R=|&rf=XL+=1tD02bjXJb;r1vmF6R^iM=g@kEF;iy?p)
zVv?D)CTLIeRwVs3;krDEL9Vy(5FWr0yFB)-l#FE|*2En0G917XJb}A<zFzd>p|_C?
zeL2KDvLc+ES*ROPQ`Kre19RU?@E9H@G>ksHf;(!f9J4Y;ZV`MYdlbTc9@FISMa6Vj
zObr7Ig-gXyW@RgLN+`rB^<hWyBf_%yx|Wz*9>pNeJ9q^51>+WtM1AdCPo`&C^Fz1c
znH$h&@K7RXNMSlEcGgwfXu=zKAVj3i^}m4of*{Idu7_Hf!T~J7Gk6xyzY~+lAr;sn
zCGTy71I2z%x9ZO`c&hc6?E!Rp%liFfuphwVK*U-V8Ol?2Lqxop*+f`RCjA{efhS3S
z2EzHdEhNL3V(RDzOWKPx{_sGwI~(g?5J=21DX({qV1gJ3Sg&BflKk(g4}Qk7-`!$Z
z+tc~E7y6Ib<W8tBq9I4wwQN@ezT3!uQDjypMAH%F_kq1dNm}?)%uk#4=Q-Sg!%)*G
zi<hK&b_#oDe>U|072Jmh>O?OW)9qdRimulGNDgyD{*U3V9N1WK@g?h0=)#gIOJV)@
zMHE*p#(lY6$a}^deK#>5mC?;-LY+;GV&N=C>65R$Ez<J4F-qQ%iH#AIKwYSs?@Kq|
z`|z7cpWmYYuczAjxC}sMdK=40N9Bfiv!A8yZK<{$BN#*RdPIOL@KChJcLUBPV#KNe
z;WmnkA4q$j!XKjTeI4s>%j=E{MA2t~;?sczxC1}IFF`xpvL5;9#fXe;6w$iy4(`IQ
z@SC&Vg^Qce7<0cE^7QQH`w)JCAK`k)A6+&^ia@+QOYHaU`g8++66yH>bsW~!OB7%S
zQQJXjmlvXK{tfPg1B;5wsAg5`)=ghGP~7+M0RGhT3%>qb?IdxF&x`4Rlze(4-ur6-
zMbNe(r|9F3@_E$pR*iK1=b`my8t|Ahmb{)7x{?`6@FjnWw0u*4y1ICcsHrMKr%0qU
zb@#X>+SsR||JWU&R@{QHo~sHqei!ZVs>{EjUZTi3l!{rH<bYm@xbb=fV7~rw<;kK}
z*40;(gs5{Be%!kL-1ILq>o+jMr`OV-U*TSy->%v;rc8Mv7aZy6WV)ZipYW6G|3GPu
zvUTK?i;+yE=H9?{_@~pJxAgnt#giW@u&$!XJGcWsi?qER`q$U_9WRbSdsTE@q55AU
zjei~b-&N=`rDjquVwYu6x;nrA4*OFM1BR8S^DP^g9PK^$8U8h7Jx~tM+XFmro04oP
z@c9G&1%JR3uU@T747t@_u)mRf=R^2iq}!VcFtAfrwK}$UCv}C!`4jweNO~A)J5Vts
zdUTA|lzI6R{u6!~B5iJ|4po&7M%tqzdz6wlx8R@fPsh{iTLLiV1}i$79WCL#<p1+l
z+S_#<{V^-oL^`gi!sk=?75*puJ|zE^HTHPb78{*?FWTXs;AglO3OvyI2Qu0f{b5@t
zIl1oJvOYJ5z__A_oEL|GA|Je}Ks}w_YeTfj&Enj5S9PCA9<~JNdRN}}emGFqpJ*YA
zAikxf_e+Hy$@{<Z^*8O3?T4qS`d&+c<6rPEoxYyGf4dloL^}`$hyDa^!jJHi2QYng
zU#>|7qaQF@4p!v7euH1&`VeWnH&Nu~{UZ8kE6~0Q|Ayb-iLbwD&Eav=8d&nBuS~@?
zJHM@v|C&@~=;Dholiui-{TuuYH-_{l)?ZWHiH(r9ZT)}2zqi_-mR=!>&>{S&fu+04
zHTa+KpK#0V&$?oKZ>Jl`0kt%Govi<lTfy8y|9xyWQm|b?FWeRB{O?0}c#-5OTT5>A
zIYrg|o$SxQ!v2)w3sz+)0{EaO9Zord+wd#=2!Cyb^6RUpv!afUb;*52j_ana{$W^;
z_^d=>U3Ran-(S@=@fY}?@Dtn~LgjAj`Hsy1hxJ?3`u`386aLsrn%!4dNYkG0*pEMj
zyYMIcIOM+FhlKFD65FC^4pW}{rmX+<Aw=w!x@uZB0NNByWJ)sMf`2P^+(!P<7}U4S
zQ%4-?ui!EK0sjd<Z!3S_wuoKYs`|YUoN&56?I{1Yjzmjk;?VwE@EiOJHysAd*Wa`{
z>Wb>XZRPlGEB5(wTSV^_JL~Ie+d)VFIoyM5@C*C|w}udPWBn~_e+cD$0gvE@?oL;?
zLP7SV`Rmf6mQ>)qm?R#^dj1Uex1xzIYYW=azoztWL0HJ?{kLELvL24bx8%K2k>)o1
zsn~s+^{=X(D~?dB$YgOBewX!sxRq66Rqxy~U5u`W<bD2-^}juYM%&f%E!nLI{8NcH
z$^SR_4Q_1J|DNej{ASpqRb{^Z(E1;2U4I<Do*AEY>0(OrB=7y7Lr9QVCvnIAY0E1w
zDf;<U(a)=)AMYDwPqZU-#N_Z!7r$T4eh%x;ibN3GzOE%Kz#EGm{sq^<{uI&_6L12~
z!dXp5=UT>Vf$`rPiXHruYozg*@qU-(z8;G7`PojKB$;3ePQh^%^>YD-K82fb6Q1mZ
zhd-+IpN1na2V*dRC3po7;0D~;YJK8U7N)~FIH}prp2ELp@B|*fi(NQX4#Fuxf*;V$
zInyfA67KHAhcE^w;5?j#qhic%%e_9b^$)gveniio5_0jFiVn#K+!G%Ej_aQkB>YLu
zR;KB!l=r_c-t3iaA0CBca6zChRo$z5N?zTB2RpqyP&1g2IpKpe1O%S)y!YYhPU|m;
z^!u2o$LFA;%X3N{--m}gEnyUn!Z|nx#|1my(%t6~+=EBE5E4~j9!}XS)Reb=r^0R0
z|K?6#F(*og(}wI2h4Y81-fS=bnlL5K>F_n64+|=?JXH31`%fK?z-i+XNHyU_V_d%v
zFL%P@&)~2~>rV<CAr~J1Be(;1;qgxEZ@^KwAUyvu4Phkv^FW*bey8;}gh_Eiv+1LX
zhL;tC-rj`|0YUno7I=0lDYUhDkEH_3+de!7$Bj>6TEnBus(Yl;=bN2=ysr1Zta@r!
zE_q4vOY-mS^cBheoEE1}YAb1}(<_Cs_g#d;dFtF4m8SIeD^lRM`(br!z@+eNCn2*w
z!#6vLZb*#Na6l=&vf|jc`(eZ?s}6NQ3XFpIRO#P){o^pB_ire;TU4v;-s^AZ<tMBs
zy{jDZ+ueyJ%lh#HIshHT7jFaoYnM-^_n$FFwH39hy$b-mT|PA-Lrf_p8EsFg(`mQo
z%O<><<qwv$dF$-;85imHv`QK)lH9x9>9EpgM6kzms>#IxNMF&}<>3sbgcLHaR)l2}
zUUSdiAJ_iRh>>eqO`l8q(Vr2RGh~3WN=I+it+|{2mjqKVEdw}E+s$iuzaRNeN&lt{
z`0}kfRaW+VzM-#}v-7*CPN{dhuRk-`_f(B;DTqzK-`f4hOPVU5R6QwI43y6Q`<<ev
z_>4;bC-r=^4ZYh_{$<(ESs^p@)wG)0hSv66|1=yHZvwi9#Sz$-{HIk67!$U^vP{6j
zkO)h=d`(S-gSI89_xmX+lU}mXzYq9i#h&(b7ai>Xc~1tm<wrl@lN%mhCuXKTX#Jkd
z?)m8Bem-?0-R{NM$o@YSB?AW^C;yTyA##Bq@TplQwQE=V5BvR^us_#6^!+nS=6014
z`7r$nD22{;V`6)Ja+81i<DRcs|3`aI)3<f~fripnlKEqvAAH>T&+Pn1HGIFHl9h6M
zA9w!BMf=g-KcQ}{zpjFA_an|v%=*UWeVG0>ilYqe4}Cs{a56Jt+y1>NT{J#ygUdw$
zt$*kO!$8OGeDDU>?0`2v!obHcbzgh@0R|$s0~T$|AMmLdX?^D-6eKGe=np$!n1eu*
zR3G;H|5XErt^Z>VSQx1O$NhfAN~N8TGjKx&`h(YBx4FrW{QZgrSReMh?_c*L^tWCl
zZT}eg`|p3S{<(o0NQFLN332{Vfbo$Idhl@pPG$rNy<H6SF6$2m(fLs2L;6!C=fB$|
z)53nPs1@<HA5e;AO;_Ca`?(@bimrQFJS9uY553#-iiDn@)ZlVO{WUKW!1bQbH{bvq
z)D^#^HjwwaW$kgro=}QmQaREXj(fGA-=Ba3Fem)blpI<VNy_4$Zm6yl*{lGdQ=KzJ
zxZ6p}wD@VeWv}Rdyss3^gnmCg-_u<4;yzO}75(@Dt$$skjPLeLF=fgx%v#U)l6LO>
ze%#=so<E?u!4*p}>?_4LBlPly&8NLrmT(VCGA=OG8G-XGYZ7u_DVlNJ?<a*o*HW2h
zU+b?64D)~rJS$rK(w_1U`5jcxdDl|J`&$2`Qq0pzF|BJdY+os+5goo!gIo+ifH(Wm
z{{zzB5fum`KeV_0OjwGoW^>I8@OG#+yzQr9SZoai&?}ar+-?22^1GAjHSF1F{K7uZ
z@09*`oSgrY8nm8hKIz#m4rLOK!Fjj<M<l>i<+z^iUgQ~rqvEDKqBE4K%i#gsg$M9z
zCmWc<yo%I^t!=)oNscrh^m-Q?IxRT&Q!r=Ej;Yb<F5KCLbbSPlYVn6u<49SuG#{5L
zygL?|qsVqzb*j1=RFeWe)`-XUzh8nmRX9(o&R18D<8zJZ-QUR!q$2HU(GMJw{FlWz
zeP{O~VJb476dl7nOhI4X>!n7Fp6n9M%xLjx2{VwZ!S1mt%+Gc&GM`nEE@mELIOv}C
zfBR1kN8lWs7aV=f3Ixwo=YF=!`VX1@%&KR-t!ea!a1WmDB>#qh*iXY*(FdR+ZCd|>
zZ5h*D{-@!%ikx+|r!VODANz>l*3TTAP@Q-l8q&X)@B*I0(_M=Eb>Rq~5hF-l{Aj6B
z>h7NUa}>_NdC^<ftwHpGnnYgi(j;;SPK$AU)b3Gj^(f!m#r}-IF*qj?&Z<U|CNuA-
z!n>pWN%B9VBJYfh&zkJteYgkDcCq21KS_QKKsAbIa2IaDqn*OYaX6_aj{~wiF`W0z
z*1x0tkLcy+Htbi<-^GvhTAgaY!ToLbU>4576}SQyU{>y{tvW;MH2fKEZFPkcMml{=
z5!e(|)g=B1uE7*sjS-Qd)gKpe;3p!UtY{Bj!n`ov@51wKeb`|*51+!tq71tV;!o;S
z-rc(XQ8)`%;IeporVPP*Nv#)?@aMMcAAvJ)1wOaGnp*^ws`OXkceu0lDpZ2w@F{!>
zXGEtkVL@n`6Zi#caAT_tnuK%k1$+u86vT|H(c(EAhk5uDZf?8&2AqZ~a0!m8$TtmW
zI{O{mQ>W*>Z9}Pf<IO&*iMb4#!p3-^X1iNk8+9h(0(`0WKdAjr&!;Dk=;3YFpTi0G
z9KM0ea9rM}BV?7-Y5R-X1h-lLQG5QVZXlV0$m9U8!*6hN+x4G?Pvm`$tH@f0b+zeT
zho9i;);7na|0m&!XlLgXJ`Yr9f2o1+KjF@{Ht-010$)h~XN=%6T_01c`d@H+tNxFP
zwepfR+C-O2>JmHzV{miZM!^Q05$XCR5nm3JKYcF)b`}2I*8XI046eYJBCf1zzNw-q
z)zdHsziw-P()s-gzJhbg0Myi__ySJDb@&Ue!PBkxpgNp}&*2K3R{&bFe&00Zm0Ewc
zX|yZBG5AdSb4>C_gAgxelp1h%>s5LRE{Qhy1dJ<llq-gM4A<aSG1hN$eow)d@P#&i
z97f2Qg`QY>Qtw~N%HAK!g-jc7_a%i7E!Aezn;bVlup6#}qat1Z7S1Vl8P^%=ij-zr
zF;#!4RXK-~@D+RwpTa4{u9?zs6Z*Poed&WCA8-^tll3{N@IAM-@xyRbkyQn*yA5rK
z_VYXVMAPCGBN3PjfPW?C3hS@K8TbLdg^O@lrysri0XSYHgT6hq{(1NU{u^9@!%ChF
zw2A{pN|H(%H{FL!NPE5#ZRn`gaC*?t9O0D0k|#sf=QR9Hta+1~c#AHBruJ`C-*|6m
z{fFRF_y>FoXN)#A<?zSgFsz7@ogw*;!A1B7d=KZ8yeJ#g#DuQ)Gs=rR_8%|95x5NB
zh&V4)t)o%W)HRxtnmxEb6vrKg&*4ABdp~NJ9jVc1PB+8UdUeh9e*!MTcknfwh65Uz
z&1Jm~h>1NWuwM<W|2TXJe}gM<Li*p<5Oh-T<7yusz|A2aHV>bRc<_k9=MQvp=HNIS
z(yJ9>h*3B%()lyGyG4uGBuvVl=4vb{_<)M^=kM^jK{(e3N}&l{`Yu>Cm%pTXbZ
zYd9hAml<rs5p|!Xylg@KBXAnNh5vvnaL~L*raVGbqvS~_1^v(A6nrb<*m-IGz<_iQ
zDWplc`TIlqe+<5q{-2Wm_oc-x%@Nn6Q8(R(jl(7QZ}2y<hSt;`Qd6(l5w+f>^H-36
zP3!*$e6IJ;phCW}+!1vRl{H4tbQz4n8Sx%pfTJ)D7=nFW)E=3QG2KiuD8n(uPRV+e
z3~aul&4?kouBIXu{Pd;WzoHJ|vJOT?|68?j{4zA%228<u_#1qs4&8Cds;O-l36?7j
z#@t9cm?`%ATH*7UtVE{4-2jmqXmD@cyKN7`XZrm^f*dRBTPo@|O@=iMfGxTFC*Z7T
zKhG)lSQ3I%M^pTBa9$%R>p^;*%X)kR7j${fE&Rx3PijV*z3f5AA-Dp6hp%8xn_RIv
zT$(_gQ#~b3-xT!c6nrfQdO)YYDHM<in1Mdbig;lSZX_FA7H8|<;S;eYR*IwUV{(QK
z>m7XK>%Rz}!6i*-1X@B*vKTSAo;AUKY|8(<0+@?1DU7YKK^4tmkEn5Y8E&WbC;5K`
zKZv%vYKF2TAeLPHele1@X@KY80(@oHw}uR5%$YP`RKLF@*p-b>x;~}o?U(_k<OcPT
zt2~tQQtK}NWAK&aKPN+#^fxo`2a2!}h4Xtw*87M;pMjpQ8s%cHjOLR2zzLC_Usm!6
znUIDS(l;!Sj-TJM<o}&$r)%ci2P!B}!f}N}P0gScyif9;hgEv4YW=zNKbJFV3G{5^
zGYgkQ8+}1=Nii#z%l;fP+_N@ZOZGF_pD*?M6S90&o89gk_(xlJ+D-jA0$1P%_zF(S
zWAva*ng7+m6Ea!iw#q{7q@pJ!z>5kWb2UE<47#SG?u@?fmCH$cKZhUSb6s9aX1y~V
zgUm1+msAh|7=?@Q1AH&i<Gwa92F6RmV@XMwb+w&$(_cCX-@!lNlDN~N1Rp-Erm#3w
zw%!I{=&ZE&%cAwbKv&V$CEZ?8L3ZHl{|?S7?Nu%kFO-cGrK|T}h#=y)c&9Hab((2X
zGWti#b_Po-3$6kO;B)v6KDF2-P>xEbw^jM@_X23#u-=#9AMhQVRE%6Ia<;Y&PpBf)
zv3I`yWB@;t0UQ{yOePS$y8LNR<BO|F{$Ij>hc88X9|FgwwmX8us0y^^ZIXYwK76ao
z@0?=S+z@*+4Xx!$ilv%-C-nbY_yI1<MKz5iCO64fWSAFYY-UFKe?<X$xZb|KNLP_a
zS#1>?1}I%0zlJMf;;kCMf5j9jz)zO7e;Wqm6nrP%`>7(zPgx=?sfB-3Rjt=a{|~?w
z_y>Hg`M9bDdOc}TSujKuC6zbyX9O-4^(UU}8U#YF_4hTqyWr>dE8Tr&6aWqs{swYH
zXk=HDM;rR{1^fqm1t*mE@7aV-Ut3Vo(O%K|2S7v7<Ja<@ares&b2Jm4Sw%kVebSN4
z^yj>iJekzCZ{W9OK~j~p{*^ete^>NcH@<>O5w)+R)3NOGpMX!{KXmt<)M@WSDIKMf
zbf^Mja70y_so227a8Uz#qf*zl(b)84h8i%ZD$yKdB{%?QEpMC4<tGo?G3x577{|{I
zk<O>u>mi|K_APwC{6T8)Ii^wIiEtpP_IE<_Rc=G_VoP2b#>G2*ApMPla7me`1BTBV
zSIEBRw^M=tq!{8yBK$fAr?mb6+UXfgd|zr<7w7D}uRjeaA629ndp$0T-PK(bm{OH-
zHa&kpRoht+$py?^d@`M$NjM58;b3swPN{G>uRki814Bn=R1FfTN|;Q=ru;9%S(`48
zzFikYT1A>hKBFQN)0!?lAgtlY4Ge6;q9<6oLqnwV)5_jnghO_KBW{RR>AniJ=i!W#
z&L7cq_NacpZ#bcS18vD*8qO$BU&0FCbCTb{5Tm1wI^|Ml;GBvKW1;`Yl(x*3aBCNd
z50OUJ#Cd&CxAaZ@IimDv)sh!IMZhC4sxinTx+s&@e+teh_K(?=o>|Xa1~`-@ECrKN
zi9^cVbj)?c{*`r!n%}BF>5q<S{T1PsRSc)Nq$Nzq1Z>DZl`b#Y-Mwwp6fK3%Bg%pv
zu`2VX{W&Qb;d&9|I1<4%&A_H<_){Uj2F$9&HfdO0-pSdQOK#{0oeKS%ll`34!xf9+
zx~2>15`IoCI-_xZj|<&lLYkj#oZo!Ycf>`au0F|$<X=YBd~#Unm69IlN&$Ozs%j!V
zpM+YC^=XwH>z28R2F&OH4-YzLq|Ymebbih*>2Yo|6JS~AaZIH1lc^(T6b?%Idf$zA
zF3A}7bTGS6hH0e~A2evr3D{!VV5a(pc{s2jpe~a3lw@8(4BnBP?kn)kEe#%_U^+wd
zsuI71!I+V#Q-s)^R3&DhuTJuxQlDO}PV0b5#&x}I6u5fYxln=Bq<Ksiz4g@hIxnp0
za*;M=VArRp$4nXN{PVQ_!^$r=lw|1|N^`FWI$jY^_6*c=n&Yak#8}(F?n~%(atmV)
zDNV8z>p!N{b7nrISEMRQ-<y2+DS_tn#^Ep=(cQIgv2#Vvd|i}S>5Zl|i(jv?9;ZJ>
ziw9b8%pvsTJw^nlau7z+4SLcHa!qF@Tt{Zm)0xcZqylAt5j72u>4a1a>nB(CTP9&c
ze-0&bh8mq|)t5$%(IAp<Wy^e~bn2u?-Z#!~a!7LuZ3lYW+>$q4y-`*Am%P>H`8}pA
zd&45w2>m>gN*T4o37);q`m7!vwW3d7`(Ltqe6qoldf%}m|MX?^D&2&;iAXN|VNZtW
zxc2KQ^kY9J%ml{8yJv1ZR4YtKy-xFh)?d?U%M6^mZFrrT62Zr1KUYgIrqSea9U(UJ
z&-rE$SF&BLE4a*|Q&)|rq3~&7+UEx-g1Aapn_S7sKsYSfuXTzqFpvS5&|P#BWj3lS
z16!wmSII1CLqG-<=u{15wrhSPZmwm!4x%75%^K9IG{+P;N4!7K&xQ>g7#3w6CZa(o
zLhDemjslO!#Ktj*8^}R9&?>Y>tmV6_NU^V@TT#xWFI3swNVfADtx1)}ss^+hetTeM
zIFk*Umc!Z%jYceX=}KBvlYiL^_@ui0YtRgNG*qC%JZ9SYs#(v9@sw9;G{>#wzN<Hn
z)Ms1*^Vo2$w~9jBajW#jl{EtQfn8`)c4o!~4c0~s6`-$c#y|%!TzXr_JLMA7k*QQ!
zpD3RGzQr)P#F6QEds(&~hhbK22Dw7ko}u2mMrL+oQaYns7tDXPsE)be-^NC!C5%Ro
zTZPWJm3%$=$t?`)n<uUc&ZB{n3Q=*3PhVvu>_tk-LQctn<<K1ww#<~W@0m?!he-_)
zR8>-%fLfCOv|N82@-QS7v*fvK=wz%v*QIFE>K+s+6G~>)`-X*+OhzmId(1YxWLc=5
zz9QnukXp4O`_nLd{78Owbcrk5UNp4+Itu6;c4dfHC_#yQll%p%0aWao(W%<dVOeSr
zdDos*1ItD8Qcez_rs_=1vizO^E$i*WjO8(Bo9niho13HfgON`}hY`CYc5?GCZge`*
z{)*lxnhKiocIB;(#^a5F{d-S4QBpFr=8Dp{t6tY04#^Mnd|lPNbYM4cn||dP^1BgD
z$7svOU#?nyy<CjI+zoUUgjFPzhOa+<q@t)JR6my%^lkrh8x`G5_{Axy4JnEJMrQAu
zC;b_f0f`5ymTSi#^HKtJ&A`AnENjiOHF0Gd6mL`U$)&ILkbh>!Z6IywOHNS$$Th;h
z8dwHd@o<)H2?N{0IQ_^5Z|G$%Z#b?S$iPU;bNixjsotGdhHm_K)gp$va@FM``AJ{k
z__Ynva6>gHS(Gw#+6T%4Mj|ec^INmf4O78P-xYs5E<_`WVe+*85uL}1PHtNm=$Ubl
zx%V4OW-qrVL;p&qaFIwVnTM}NcTq)JRWXI>$&z=qgdy$9yf#!*G*h+1>j$wTrOu20
zS#|yC88u68b&t$8KZ1d%4cSiU#fo99uF;WIE!o*<$DfzX`h>sjTlOxjYR}C0hN+AK
zQN5Ug8JI~a>FeSy6=!qvp!knh6;regH9st|AF{&3;n{l9wUXT^(LTp=`hB}+MX($;
zAR=fKM7N8zk-`z_S!Y;!k(Nb+eghjAKL3Wg_lq_5NE<XwjjI~1i7Mk@Ah%YR8SI<I
zESXAntSdC+xAA|hD{WJez)D6v)U&WQRC04uJW(A@4)_k+sfd5yT!Q~;#jcX>J;OuC
z&ZV3FW{lOB-HfVxxoCjnFUM!Y$wqx8l2op6HS4H~Ua)e|;q;9GzHi&xw>8I)cZ@))
z@A~7Ov2IfE?~3p!0=5mM9$2j_;--@RDh52iuKf=A^*!C+wzetXf^Y!nejBG?_<~Y0
zbLH<JS@Fp3RbpTJma~aK%TG>L{L<LkF7yTl>)g|K_iWp9Q~QCmFsjs>xBIq8y1RDr
zTgqkCtUT2=`qP34JXsLpaI;yl4c-Vf2WC=xaxU?bnGAbYoXPSu*QVGzrp0>IeDj2a
zRcopTa5WjMwz~rTOJqwh)GSw(^#t(IG1BUxkgIHlu_x6Ts3x|OY+J|LQhTOJnVFH6
z!uNR97OVtmd&kyavWKIS&@UvJLGdpK>GOhk?*n|NuM;yc);_fOC0I)9j~7dwWfi*t
zMKqZkk*L4_ddR;i&8(@-ylq@ZrA=s$jkf%Lymd=R;bGY_i`wHiz2&-R4Mv-MGD)p!
z+V%*-DGD!b>Z*9M+t~hf%mG9SrELcz65CyMvUa;-Q(Cv=PXt>Xi+93Lv<z8)0~&r=
z%3QJ7X~+zPc63DLuz^6gZkz8#+<|1CZ|O;Bi4CC>Z_>1$m)w3W9*fh_)WaM4zY0sD
zRu~n+WL7jRrQ$2}tg}LrEM9-ZE0l_B=@&)T?g&hFRSED?(R8^u((`L-;aNwAWJT@D
z1Jn7ky?+EieVMBD(0@M+15=)|huNlxfy}W4?Ur#q&luH`=Ypdi(9_{t7C*@Ff7gm}
zL;R*+?$hrth@*M~DPl?Y!pt5n>t@@wj><^0Zs`BIls_{K8`yth-m5KSnWb2N+X9d%
zeJKpep%|Gq)xh>T`bgJ=ld>v+`B?pE(TRl`FRBf#nM`Tw$(&Z~om(em_?>W0(L=S0
z;P_t)XQZ*E>9H-{uBtb(DGG&m=`XJtiS?*epyN`QSyyRNxL5EP-avC*r0p*>`|bd$
zKvln{t7C0!aeAT$bT!FoRfzHj)oA8jBypXN$YM$IT2A}4Ca{HM!y0yIbJYfNDL~5<
za2eWZ{m)GPW#ttz^9p6Vmo|kfv$6j0rY%)cBL9G0)*>cMgS{_cG1mVMUc-tYvN222
zwVH5EgRu*OrrOY-SFi|&?Eple5e3=4toO3yzmDYjM*1_l#RgQ0SIoY+Cmtoz?~wnJ
z9vE0OUbXN&f|;hEjh3L*R!4R+V0CN1kI1%P41<>8Wk7%TgtYJi4q4hKCIHaCL>#+r
zJ%j@UC7ya}r}R?;GXt4aplbV_MA&Q@&eoVAZ1$C~jAq)dTyHvjuYLWm;0>%uf3Wa1
z+<0W@*8~leM7{}yzM$M3QWdW%qW*$K?cW<%5B+~92U;>IMiOISTr%&4i?xP&vfl`&
zY|_YD!-r<3QC&@-yn>}9|8;mLsG)}1IJ+i=vMmH5JG_BK*Z(CrB;g}bMM9-(kOuGJ
zdGzph4T}4Sj7JEpOul*akya!Hd>NB;%c5gDs+v^Q3;@O3Nmt~fPv9-|dm<CLt1hdG
z-Pz(8hdeqWy?*JW$xq=4oGv2h#cNa)0VntW79PX1n50{Ur*KQ8-;oOn_Z`O&kyku`
z`|uk20})a^grn+}i>hbEt}<<7|49yGLw+mp5}v9TC#pHQV%Jau#PIw8p2Q%{nh2#%
zsu3yDE}11TBlEFfq~ZOp2K>*cjk0bc@C0~EoA&}9I%)VaynttLLdjUKqxTezW%`yk
z!u44U`6rRQQWu$D_#^EXcdA!#6K*Bui_o8FLyetlONwPHF|O;!UdQ_1OMi|VVQQrF
zGSlBKtiWS2@4SxnzmiH$=>6l8-ZAUnm;4{YQ&>prUr+?PZgEe)m`IBIdpd0Q91&?z
zAYG}gaomE?l0|P-k$=3`{%^|vIoyY{YV0eS=KJR5#(nrS^}+YyE!=}sYSD}$tY1HR
z(*H$x0Z-u75b5~?m{)PkqpyyIziCrn3DoF)oS$cKTb+$Ht516VrlTdi7AVi_lmdD$
zgsJ07kA>6g%8W*KWL@Mpo9Fkv^!bE(%Oa6lvXp#Z-v5pEf7AZFgohd+kM@^=U8182
zBRQala4!Md_2G@=aY%LLxap*FXGEfXv!5>_fO{t<r9;I4h%^ZHzuBd#CGNIg;jciH
z=xMDDRWllP0Q$uUU;*xn`|M#d>>Z^W8>;I>)D+<#3WDFjHIX)NC_r2Nq2nseRqP_`
znVU5cFa9Rl(~b4_q~B4EE@{Cr%H0#J=3UXoK8p3{HY*+NKxLc4=_vMjB1Decp#WV~
z^$(~St7gURC_naP{cgZ7@Mr3wL<h}#E5hz#6DbIo>w%b4??)CSm-iht;bFxpWPP5A
z`0{!<ppI?+s7>B%ZKq1s`WpO8>j!CeL%medJW{q$Izq|!a6`nKPtwM9i|a4j&&JEA
zs9+r)iFou@++bt>#-cY$H^|V%u)dGrFZgj=`H!l)=pY(ByZkqG+WrvldMmyEh!}Q8
z?Bdihz0Bl&p1|)SonQ0yH*8|QYE|QoX>l9giZuSuIKR<YGOAvvu-JXu2ox^fgI`2?
zUO2y#wgp3l$DT4BF9nBmHw;i)F1n!`M9s=2QO_C(((NYvB;wD0SeUA{qj^J0uB2>B
zckM^wj@+<6T>+PjSOLUUxTn$C7QBY5@QcF<wUz9c))Z&Bj_}@@>XC1>{=Wi^Aze-D
zBB_ZppPBuL+Tt?YgCG6;y8cg?18N$HZ(BS1M)u=&=zm-B@wnOv^UZ#NGXJZzHw6mU
z;V1YN?mDJmQ@S!D6q9l>c5aI~^RAdHZ-fEqDu<mL$x7d{DCs1(;1&D@|Ab!?Wqtg9
zOJ}8OL1521sgmh_Cf@O1hiE_73^t{q^>@wscZCO+wCCS&Ga{(I0+dW@SS{v2J5s%8
za6{hnX`~9H*c{EZ^&;U&S80zI!Xx`-h&HxqGa8xM4fHQ%-5itm{Wn~Nw;}(?c2v!E
zMkaiqv_V+EKb$nXZ>R@#YbK?5`A*j7Cj48Z<(u+fw>6h-3E15;8Ndhd3;gT`q+{!E
z7*$bb3E-Zv8-9iVgg+b)Yha}1<sxuLNp@#d7tO2ildMPE*Po)4x==Tw6hF|8tSDCd
z1Ac~U0Y%Z453I?053Fr1tpBn|<Nxi}1Ff*Dev5`t<wUVEGLLuQC-`{?k8e%#uNeYL
zQ;sc{m{audGyKc3XnOjlnvF4+t(~N!x<!h<ei5GCW~#ioB?b#0h8#e${>l6Q0(Tq}
zv#HBUMO}@#-Q)VgbWIoKzr?(|80(J^)34l7%xDWs`3>BX{yz=<A6UC=T}Lo_LNF8f
zPR!k#);rc;vT3`RC97CG)RFg1@-NuWuKDl)0g1a)+wAWR_<8934s?F23ZTOQWt929
z>hwPrY4eY8bBKAfFB=*!-4<X<OC_Th;w}DZD{pyts-E5X+Xj1*@^;Dk|Fo5~xnrcw
zrQ)5hB|Pzma07ma-{AIEG`&@w#3<QzZKQHpH^<-MU%LJj*55QUh*fr1#Q6SDcaJ~e
z(GU_;OEx4bWnS!B7L?;Pp-~R6e_*8ZK1J1)d0c~6a9i5*#}FXqnpWGi+hW-E_X6_%
z3x3{e{SZ7((;5XM>Rgxgd<NIyN0HtS>(8py@%py%mNI84diV`~hMVENV)NXFd-%Jq
zBGW_o9sW~y&w~8h_WKcRbyP805$Eqs#lH83&{RYIeVLJgl&>$>``CEbuMX+YiiF>m
z@YW3rW=)*2f6DqiDz+nS2&rREv#f*Nl=XXF<Q-qwcKs1)uc@-w7iH&5xGVh`KEE+E
z?1lHtOm9lt`~^RWw7ffXe%Gy8qoPges^YMu%-@f)KX13LKc>gax^FgBB3)4C?l*bg
z!ur>Wv8H#vtqVBgsiLo6x02qsr94rJjWSf|?@Pu0KW;@M?Q6p$;TGLOU3u?UMbh?H
zJLx~xgWomE%qMUa{tdruYd^z>M!Bg6U2=X^-surshimX?TUvv4xTl)V0}Vzt)K#@A
z9li;7Bl+T=moQL1@02FpbE$DC@J)EMEf()0+)_>Esel9PT3U*|?!rBIxGisV51z<q
zF2ET*UsBaBrT%Wioo(TSUrTi9o_;XqY_xli#rynZ+x5SLt4dLy6(o0ELV2M)-Tj@`
z-xR>$TX+s<mD0=Qo|6N<2Djn)kO3LM6Cpf3gL6XUu1OqE^!s;qBF#>@xm4giEkctK
z6<(KwXMR)J*Wu3^+*6X_sosAA$}&JN;2vC8_J8>ERLAHIT+m=`8Jfm;eG{&4J-x0!
zan?>~R3=yEEp7gFxW6r}O$#2V!|55EwC6i|{vO;_0cBYE?kmH3Pgu&+dWB_?5!{mg
zbhiHf0^E_PU&3i2L-cg_O8a+vcm3%JHu+$TN=dFM0J;e`hrCyu-;(tAyuqS`QYGa{
z`m>Gvm*B20%df>4p9aHM<iqd6b-26j`djc&?((fbwIcbsB+~b!f7|HKGr6lpcqusN
zT(QXu*^fK$Xj?vobbedX-@`ip10|ncNPo6p|Ed6fm*9yGZ(Z;PsZMzV{(`&Po}XvB
zslSx)$2Ioo0q|{jw(a?S57(7uISt1)K#5BHUlf7AfT!?!cW$yA<}_(PrDSMZoi@+a
z{ITO_6b`_=CdI>gEvWMUrr44={238!>_MR_ApxXLi8s5Gij{>lktVy70bNy_$}2S+
z?D(00c~K9HTh;6hyj1f}f0ySQFbjvos5UAG@J5CAce|7NlqJ8z(t%u^BPob^vHSYR
z;Gib;Q&(I^JzFpLL)tx}^&f~?L;L*~dy=wMU=|K*{bNf0Ey5dZ;cn}n5=_RNA!shC
zZSBQ=^yiR0KO!IUR)i?8ch{d>u+OdnY5$+YD{r3LNq=VT2#vu&$-ignoZD^v4IzKe
zTSGxhQNi;)N!@ahuBY=m24%~hr{<&gyF5Q3{W+}j-;qguuJ!Mx|8;}So>o)eirW5?
z7Vf4$aej|zHaD4&B^9n;?oP_(`ZK0`#HyXYJ@jW(&(F$!q5Ml~HhQ_c{?r6hGpk&B
zs!2zq$n)LxXHry7)2avbwRz9s8T|i>d#~lVvTR#yY?Bt=8-f7ny(i1`IY$xqSqgoG
zKH-Mm^><Qtm!m0q2a+IY;9c5n^ssWw96K}Ncp{{XiVjvacjnr2P4C89x9h4Oli#V9
z1Lxr^1L6JS<0glK$7bP7<@m^&W!Wg3GK#7kPI~>6q@V{zLHozSlXRb)6f}J%2;B4O
zkq`UX^T$7M{sM>d3(QdO+tkG|&tH~;to6$89q0UDHL%D=pvn)NzbM5R?R7uy`L!VF
zoEY~C$D)6qbOXdpQ0;f59S-@F`Mk{%R!+qKIq8Nl{@FVY9;o|RH${Q;cU|`<pue%8
z4KC~+pZ+09U<J(vui$9<dt}}+wQrT@KbG)=v&|*WG}M`(IvM@T(h*lB{g1^z{cMK`
zM0uTnfBL#My4O#ne+n{Q9Ln>@l3z~-quu>E{@*+P`3vpkAl@NAlb2-aalcwWAw#6*
z@mZte`8@Oe^<%0w`}l&*(mcLsb^Lyg0(M;0tREXVh$a1hvfsDvvLB(JHM{=c`+JgX
z|3Uf%QmX%v=beWY{TTmQmrp7~cPBo7@?)0=7MoJ0Y)}00rOtZd^Xtn;KYaf>cl0CA
z8w>o%`y2flKg@ssE&j0<wI5VrIllZ4@=rmS_eb9USp3t&0Q9q8sh>IOeZqb39_J)E
zqOuIoL)(*oj=l+>>z9TZrMVpaHmB*1`1Eo4-jJkd+;H3Wkq-k1h%7e5w%sbb1a=P3
zk99mhz->4&kFa%2(CZ;-nY0GGZJP``{8QlWHS<vS2{(VN=L7DY>639Wn+`wv{C#}B
zoON$@j)OQ3F0&ls_7pI2ybWw0$4T`0`vYwJOE?<3bsQ(T%tdZxiIDa=4SF>HmjoiK
z#Ki43`S)o43Aqc-NZ4eSWa~KZU&{7t#xhO3bsSA>;aMN~*Gi?w<$pbYz~6Vz?;iPl
zS@cI<Y6M$0S#{+3heZD%OTVMvU(a8XB*hl6ah&t_Q~vrGp6_$Cd-s^<AEEv$Fhynh
z=UDo4P*^x*imJE%^g4ce8dxCc^dNJ*uEBDJ^Arctzyrp(JNq=z4QreuUMKYQ+99VO
z;OgK-Zp6<?j@1qLeL3hFbHEgEis*O~%u(*K!uV0=BYJ*@fLWe@n#1ol3AMs9$${7C
zY6Pdbe&6au`^1cW4}3Z38be@;(2aYr0&UEzGkVLUia?1X9Ns6CmaTj5OMVefy6
z_dhI4u&Qj7Svr#bb1bk7vB>P&?3RI#hair-&xj!250n2IOf!Et9JJfff0o;nM|p&p
zjZSW!XZ0ZZ=U5;(W!uczrLxUou+N9k&*R?W)4&WDn5S$p`$G9yJ;V$S0dv4SbNtq4
zu+IB;nfBiF59Iu_;sBWlECEZ0xIrGFDW^%lJ}MBa5q>`M{O<RsxKGt;RBOO8umr3e
zbp8Sp{PVym7MYvUWt5rEI_UgPf6M`+G{CNN;v?`8SUJ4L;m$uNY5qW(EZltA*Mr=E
z0`=z{YYd^xg?|TDfG<b#zf*n-ocppG%L?aPHx7Azfbu)X`*Ss(4VGy>1K-t*TQ50^
zz#?!1xJJxWTOYQ|3We)bd)<?UJ}?VhB<}k>;eq!FPO}Ou1FwKrz?Ysh94Bt}MPQMT
z7I|LaSN{Ifo{d5!U>>+a$jWJMFb8k%E%1s}`i-9FpCTB?Wj0Ebxr^dE2XvpYOQOg5
zM~OUffxvYIrumw{H{ccU2zUZ~=stfASOBgQG~RW&`0P<#W1HGj&IopX0wVcbW0%he
z-+x28eBJ^}Y{}|D|4YD4BB%5T3%C)P*N2?n=zp2GO@pj)L^5La1bEkz{vI#|Tp<6P
zW{Zp`1In*}$H4QRU0P0hE|Pwe9C~qOqFr{WJqMom<exHdhM?_E`wY_n)Ho#Lx-{M#
zME@D!0{M52T~f}2b?Xs60x#HT*PH*{`7aZ7HAfHBwVo_dem-=E0tMhSaGCU<Cc<#U
zH2XK;DbUy5qpR|J4me8%JS}|=*5&mKcxoeh9iK_c|5+k{Se?{~@LS*s@EBO?K7Svu
zNZ6o70Zs4da{2op{ttoEz<J<2yX?lKMzY2hDc7a=rKb&>1TF#BfD7!BD+6GSU=4T;
zi~>)APu-zViLgUA2nOTA=NjF2$A8a&H$Ca^thdvDSBOVdB=KF5|1IzWcy*Be83E1%
zHz_|OjG!BAvRMIM0e=JUtzOYG06BsN{|a2=(utHgye2RQoFk-;uTu&py{CZdf*U<4
zEeclUpJ(d{x@C9-xC}f1ZV^3ckaeP%eGikgX$i2Tew?#jcYwRVc_xpBq*HK|&h-RP
z(kUI|{?ot>;5H4QdEhR8F-cUdVfK!G>Nx)>aD~q=5Ird5^i@hwhY`k=nL()>F>`+b
zZgFRU=8v2cIZKRyXWcq<7Ptl61+LQiSdnp@twZyIYx1Sz`4Qj>@H0W%hnPbxFp%A)
zNp&&JJEeaaI1k(+==@X`cgz`(X}QrC-RGYHt^jv|TZ9hF%Lcy)IF<eV&mGSX0at)u
zfcpgg%gg9n1T0c9I<1zWe*rj4)Xp;u{s)-?ay5{3f(+$*^3N&YDy^6G=(-eyD{?LZ
z=YUD7)K4AfA0}+?J+AF2%8Y4^k;N35`mNfq0<Z|&0`3EsN$Nh?n&;YYhnW<5-Vw(*
z{=W`fC-m4f0qiUdzeug(v!^=q83(QcKLK}uIo^L>N|k$TopOpXbN(`L0k}uZ+EJ!Z
zJsPkD_Aa{TJL&HM^T1u;K8Zgr3z%c}-Aqy|rrn*c6TmeBKc0~`%a}70?kk2Uzwb1T
zarD1O;MHl~yiHlUEban#alGt^!+d&A*MY0lpZyHK28rr-ntatm|2Y1+58MSVF#Pmo
zP}Z&FIz_LfHKrb__V@>I4>&ErcPV+Ok3L<<p#HfUac4X40yly4gnkNS6LFb?$VG}o
zP4BP_oCWR?HgkxDa%;3nSyPJHE>JZc;1TrrXF}fRn91zpyr@r<ok0NXc&O<4uLJ)H
z++v4#zd%>V++|>`PceO})~63ROWgId(&H8KdIQ$|we{hxYQI|o8$6U<@l#+{Ry&XD
zn#&pi&H;D$`y<lj(B$;eAO$1GzOS|);@p?Jz)u8i&vWQ5PZqGA0K0-u%Dwl1Ux6#s
z{yz1NCu-PdOw?fOLR))W0`3EM=)U*K^G;&>r6f`3;As2&XT^aTmd(I9(rh5xI#gBA
z^eNyP@Bp|&#PL8zCi6_J_H*)~2mf3pV*DxQ-$F{C$65Xn#uWSP$Zji;dEh2+54gnS
zg-vY-<UE@o2l*!jy|3HAKY&}noPghW0%B02C3M2~QWfwda1HnsxbAS+WL`BV<MSmN
zqe{2@E|UMx8x)zyjL*xn*=sjfl;NL$09V=GU*NTa)l92ud2Bjs^Zx{Jncm9`HAg{)
zt^)4z$aC<%s{G$a2l6N23cc5mbZ+t;2L+j;q5LyT(DrM<ELUdM$nYhgNrWVyJtZ%d
ze;oh)0^DHo*tU(Nq{SF80SvKmy4u12XXpT5pre$}F6wVGrj1yV_~dvCgTQs*zv%u9
z$-eQNEUL(pE5M*++kX8#@PMGrBP_hx;F%}OfC?0#4Pes(u{q!_@GEdbaKCNJ!wTuv
zm(tLh&L}~X?*fao9=2%56L~Ik?qy5QUj)trKNB>2B#T@V$qwH#lTuX{Hd;>bX`cTo
z-Na&cVTP?BC^E6R;kGRT7yzyU{{Vgh&hz(8`$w|2At!Ca8^*vPL9?CvH$@LNp9PXA
zh%j1BIyM7zhYDa?_SL1Ne?m=O;7)W$!Uljb;1=*-1PvdM$g?VV5msCbxbSq_wBI@4
z0ky}3;MV(s;E<DL&`x^WJ?FE)UE+SvQhH5$<z?M)fjW5I&OgA|^ER-+7}w^NA>UGF
zR(TUxH7B|VoCEF?blb}RiL7QfHI`>gx~cp#MHapSjLWFLC$pz1|Lmk=$9n!p2>Shi
zv0Gl`KH>y$K%p@H>B0XOx$i9OTHg-|Na~E~Tl_N(+you~SGaK0neI}ySvBGP)%pH)
zcYfoaS*osl7D6Z#z#We--{Suqy~lgNIVN9>Tdc}yo?t3%9c})(P3><EOf35beCgfI
zaddXm$?u?muAgGzI~DIIAn{HARY<=!|NI2}3|yf0*F6_08^3LGr9%DDmfu^zPrzl)
z2PULkAni~oBa<r6-w52>JNNA}^Y%g4%5MnN1UPrG?ihHC?$ZTeM0VYp`SwZuJWWcK
z0$6bd<QxU?R2FyKa=?u&ibBlU!Ce50zyshX;-HsgMQoZ~dYhvYw8NpRmI1lO@WHH)
z37rW*UKaRNIQO;h&VLd38Ms5(S@U50UhB_O&Tf8nJTN2lACt6$?OaHtCCz6__$#ns
z^uG$+1Fo`kP>?M@7Cz-gjXDFg?g~;x-~xm9Q-WM<|HQmbk6+<R0<EE&dEh7DJ|o<M
zTw5egcwPn{x4EG@a`e9rJOD0nIIfTd`S7zF;J}pZs#Aa^;6gU}1KGJ@))VqYb<U0L
znDpGE?_}$is<MO042aLAIT`-HLhU!s_ixJbj!0xFqzAiZ{9_8>E-){U1NOp!#J(|o
zvmNdj#RBjX@C$Gm7?IAOluhnUQGhuPo@UN}9r%f{N1<z7PAZI$a&~*ehW=dmbqBak
z>kGR4eL{{V&5#qp4I3Ig1>6N5a9d_bCd<oqe2>q&yKvLo>n6WfnEajS+IS3E)oM1t
zDzg=JU~d7ZvdvSm;Bc7y_;i5W5irfz`MhAJ``Jy0zSyvU?tA<EPJS-}V}dT>k!=%E
zz6CmP-)-S)f#JJL9`}pz_}N9_an=B@0^5E7i~uvzvl5VKk$5oXe)ln%K5da+KS7^|
z86Gvs>xHb|$q}*LwaE;b1$4~^LrgbYfyQ)@Iq?I)xPXp(MPQuC?Lql9uy$7wI1_Lb
z_Y{+9071-W=sT(JXxEtM|5e~L-ZaxmEgbvPz&K0RO)+5FjT<wf8v|xtmCKE0o|fGP
zF;!=iI>^4xrxP?rRS8Jv%mgzxb-}RG*elOb>IGl{7<SJO05hz*7i3eJ@sKu@+xxme
z$1AdPV~puypIfu~nRY2iK-mXOFd6Su1%_n0x-53HgA2?D=@XFa7XQrxXMh<77^$Sb
z>*9*#*d#m6-}6IY447xtydd$0FTZcHkns1cM+^f3LZ4&!Tg|TfN%@a!GMtcfWUF)I
zOm<zbdEhxwgFX@4C%IcZR{+M@c`}q;v7Kb4(0xl8m;@GC3~JMVigTs0m|pWCj43_x
z%Z$mdQwj6r|B#*7gVM&^Aktx({NnQYK?n@7;+M-Bsxa$Gxi`x3zGm$F!>rGZi2T}o
z+AcQ%c_iK}chR)ZKPhc{IdPnQE;7xDF&+d?v(!*D`JHCb$HEs&pI8blFRKv7O@0U2
zcGX8Avq-NXKMr1}-B;`V%cTD_tDgfh=&3<aK)~c#>Z>p;uxs6_tG;-EaTnT0jwtgk
zPua2wLxNizF>h^})LgMdc(`3sD6(t2D3O!t0Lvd)K{{ch><~~apgP2k$AY|TME5Nx
zt_;YYW7cGt<J3VvUt3vU%PblS46(`@0C8FB&2_3yqg*1fc#}J&a?B^@OK6Tu$p^Ik
zsRU7;{Ju}b`}X+<2_kP*tEOa2I$Vj6yNRvJ)hyGX=oMPXWKl<5K#K-!7l@<(2rthR
zHN=m%Ol-`hsG~Gso9@d-m^tYvUHI~keSBV_a~R?h`JqqrrY}4fi$m=TX!rn7D55${
z#Pzc9k0$_yp!bk+o8SnmY%QHONNZ*c7fbRhm2<gbz^I5^T;#tOdEw?n=CT2~g(5|(
zCl$~f<f3U$-Zm$CBhUXH(~Yt#YoBL8kP0BNlP|L1aU-n_y0qkXlm<D+UogEBQx=<y
z14Hh@7y*g}><=<<gz&t%8=mlgz+A{Mn`B#4N7Gb@p~z27*1r{G7i-K#GA;SF!I`qe
z_#RVuk!VB<f`?g9E*4Q6HES%&BcjuglWVg?ss%>~DZl1``(pPX5vV!lPGpAd{QX?-
zFitHJ$f9yjybq6&*$Df*i=idQi!%4;3+V&V9ufJe%p`N`POapUimZ7QomjMj(V$Et
z5#?wgL-~EH0IK`ri441Zc9@g>vcBwxhP;B;CD?fbozHQqK4jRQ&qCd#%*FUpmQuGW
zF33s>`;l1QJP?-Lm#lsc2yCi9FJ{^#*(^~+5;$!$%tBvFAZ<23FYbFHVw`53(4tiN
zn&z(r@>d~#ut$}WP=}Og^Kv;<OXQ!t?8!7s;EDHOzh7dVqU9a+(bLIG=Z5MXW9gW-
zrCpG>kS`Y0M~_D}V*s(Ip|NZzQ~5E|y-{}AyQ(28hLw%}7GRmz8qjexikc=37zFf_
z{;}m26QGvGyYRBdHvir%v>#+w8h{dN4W#D1*gR}1umWxQwX5!s9J9UwJ3tar!M$&k
zXMVig6|Y1B`C%EU9D7QJfP5bN{W3EN-h3?T){<ud3s5Di(uPuL4zkc9N!Ee0_fnP%
zfjAWX6^QBbrHRAd#hi(j3dwP+Mj+g3W$suSu^_44)V|#gV`ct8b`UfeJLN<XoBnGx
zG&qNFNUHWeugWgmoGJGeW<YWwge?Q{v`mu+vh_RSYuMju>4b))_$-HKJtH*=ZBA+u
zCKByDgXP^p!>j)>rG!Tkqj}1%iFoiK>s_AQJ47+st+Y#93E!e1Hwh*Zw5*n7txD@9
zn>34oXm9nPOe+TZFtUFyrs;{r03~uk+F{xDaZRl?#8#&yB*cS>Maj2+s!q&PUuM@m
zc@pA;SVpW(f9n{H87u^%7{UIF4%^&GB*-5+ltCmbbyGcHDw~}fw5%zA)1t|S_`$Ir
zL3u0A`WjxB8nQ1<tOX~}gjc`#zD4;3UwnX+FJ+27^FC%VLm@#dI@mnp_RGhTVoya;
zBD(-M%I;%tpI?hpiPTP0J*32uY1Tq!n)HIMXOxj65j)cn?O-P-mdtqGUR&KjWy<#Z
ztGY`>8G>BY;CFede~gMIA@;qI+>3QKG}wjI1d^2e=Xldd0BL*6%S9@)x9Qg@74oHf
zKqLqPYL)#C<<}H&I5pi6Mh@u!wtCD$6IXp#<4P|pn0BxVU$|8)+W|H?IR(+L>W!yD
zUTa2c>c7N1v<0#j#i*VML=(rhg-#cK_)J@clF<mM+iF}K)r7!L$Flh&5)ymzMP`X2
zp<1ggu1Sp-iZ@miGibH^ggY6U?))_Y3ovS_RJCP=UtOcNYhA_;@eT^Sk*3fo7OC{8
zN%wi`R-1a2tK~xxeRZ6n5@Y(L!5Mee%<dO_=|lt7V3X;js*9K+RxFzGYmulJ`;Sd-
zgtF%Px}Co&$v4$Io-{*RMlRyNMnKZ!riW%WM)9*AqzR^ZX08*mMWjIbmaSqH$k&BJ
zvwi;Kj@+7*sWClgWe8I@_A?d@S%{oO<bTU{_~Ly9LPgV?b@}}^|JTKh$cr-)2``!l
znu>GUV!;ae580#UWfx7vKi3TOy~#}p6*2b_AIOvDkQeDZe}y|HYV48kNTiweO(c{w
z9&2x^s0pb$8c!hK)l|^=h%I%ybO0QY3M5sLw@?!)@C~M>$y!B)Niy>l8?t^RqB=3o
zZ|k21>9I%5BnuYx+Vm|F3VRgVpy?(sCM7?$j9Nvv`gf~`F#-12Q?xJZ6;nz273At-
zuC`bQcHIkDPgac-(+YGdz;BLp4Svf$(5OpKo)y8Y<YbSfWXvkhzv)WoP4<-Su*1R=
zazW~F)&kPx_G0%)yRSQxj<=@7!j=mvumxkEdx08tE@{~p2GHc2b<w2MJL_DiQFWZX
z!&&PJP@r(d*%X91MBq}U4WoaJ%N#rUc00;2=A}l!9@}W^Rj%n+=jg2}phQd>M1b}j
z;<ZWNj>tzQv@%5}>ZwWD*f;)hf>Nj4gIC~-^;Sz*19lwFeU<#POU&q;1dGP`&8!;V
zwY@@3Du8`r51PePnwj6U&$R#>z@9tNK6`p2=^i#0N1YB!=T)im>NR>_`(gl7ak<Su
zPDK-K8vjRZY1#zlgw@nz@I?oC9L(Of{iRj*H121Ik1TDOXL+$nGrqyS>-8G@f!4Tu
zD<xElWn1kN3G;0@{&$z*dO~NVlBZ#oMaePa(><{{Es<Jt@<=AJKWGZsTJ!d~U8`2(
z9_>v|koa<J%OV;5e9GI7qrdetZUD0kS$sNrb;-Ka_^%ad;fugs3-0{=OkY@J2q^~I
zf81poaKm>ES)YK<<cX%hB-%d>q5fB5UbF@=_JB1uCHpeHyH6G|HCUnQSpmLSA99tu
z<ab0-1-$XPfDwW2;~GJ$n{J-wJ+Mq;nPlZQ7Gl<AardeXvTbpZenmn}O;rW5{W#`W
z{%1SCW1tp8lW#0i=}>7AuukNcHFy3sZs^!!(`V05Ol$Q8SW^1$a><h2A7taK<u)zP
zS>a}j_W9R<?~F?{Bd2M2%WZBGq<qbJaBE!hvq_R#jmv~yeYaJ4Uw}QWtFn+fl=GYE
z&C5vXci@|z{~K}OtJ!v8NDyfW&>C4_1BeoW$$SGUl4u3fv*vy_MSFg+9>|39^O0?x
zO>vy<AQ9z$4Ornut6k&&RRSS2vw=q-(hDkvZSIBH03ydv%LJjX2v%K<!BLGO<oxT6
zZqW7>^1nx0)s%9b+QW$azoPQH$|XAyt#6I_3k$9-BU2>{uL3o9{!jeAx}*sbA%SMa
z*U3No#y_8dWzH2C@jcnbYbwM|5Pc#4r{GZ1Du3Tx>kdXTjnbgb-*WVKSGpzjVog(M
z+85j@@=<{UcDSc%lMR0s2l=cJl<E5H0IR?!mES#X>e-TJg@g*DF1+Z|VO#;0fpwrB
zxFeQi{hU=DE#``adaDG^dtsAkt|&9YW>Bl>$IOA{C;%?`@)39hJO)0wgGB^w9^(Qs
zi*Qp>MyV{7c?A3cymC%h#K>nra%GluRUOnMX!axErEMcf*jO{dYOK{u%mA3`_(tHv
zM+O_(Wbi)7i^@}hS@zF@x@`haJ|ajIQfcfdOLAT7xZqmV&%hI+CbrLC<Oa$D8&S$E
zi^Y;1NP!*TT{q~x)#Z$Y>Ns2Q+aMm=W5V{<jDD5?FKCH^pOoIi7vLH2o5J44Yz!MA
z{X)q$XazC`Hi38KpJn48kGmU&*joef;EhZX2OuBv{4M(TXB(*?M#HL7E^Ph|JR@xH
zcRPQ`R)iwUC`pzJn@qcH0Ple(z%$^pJHOM9Lu{n+q{@_t^>qi?Bkc2^z&p#r7GzL1
z$g;iFE?xM#O5ny9Hi#6lF>8_#ZB|JL<rUm1_JOwqer)l7#Kx2{mLkoDRB6?HM#jd%
ze*v#8h%HKETTWuvLN@!|6wvjjrhklohq%Dfe3(4ry#^V%O4#p51{|u5U4z2*mOt@C
zpE~||0{m(GpRyBpnj)y-Zz7D`;6|q3l>SX1&tAA8Mt}|Jxicz6#J+qAJOP$XenaZd
zQHqf9zn^_0_x^tZzd8B`0P3t-^)m(HiS|n*2v{fV=tG5#O?c3Ls-cp2ZO)|B8R>rp
z{s#U6Ug<gv>vYeF_OLtjd5!D967Yz~sbAc~7ANJT>KKR}v~=Y*LF*qXY_Byo6eQc^
z$?aQK`#bQQpyzLNl)J&$p}+!5UaG-OMj-C?o&kRW&y9gy@*^)RcynYrliq|Y?VbYv
z0v-aNELKo|Cty=bnPjg^iF60}0Q?F36L_KYZwUR$LP85%Qqlf@6Lh{!f77zo-lZiw
zpUJmxgf0Euv=`_y`_hgQW+%xl^W?1OWAe|Ap5J4CT}LOMr7&Hxa0z$_{0n$$3cSJ0
z<DkHlB3l56X}z8c>}u1_A2D{db~a5u#q@@ENWVwGTU`qoN#-n~*X+qZDZk}2VXL1i
z=se`wt(_~7PVR)mTaNx2{&89=FCabnEEQ9^|Br<Ieyyt*nk=_i)M@T-l)b-Y8lb<8
zf2>?)y=>Z4W^t&Cm0tkA0e>j{6DDu1w?&sAB$7m^0V_nl&a9(~ME(QuJ3PK!m6^U}
z;1L~wug3q@*w&YAXR1qWsv*|vk$J!1-I@>C&zozA-St>HT&Dhf-Jw5y`apfz2_|!<
zlz;wd`ZF*c6pOg!q<q<Am2;J#<L>siAfKw};38!x64u!Pz5`3Z6PoW8ZA7uUa$UaE
zT65~0dwdD}MF#$;%gFXAhSmURUVp?S<vQ>V_!IcO8??ESO*L5*X-UH^;0tf>U%>MY
z&}!=fNF-@*c59zSgZK0uo|^lo^h>i{SWQ`bv_s5Lr+wacY$L2N;7{296v&|(Ow%m^
zj|uwyx%>Qfg`s#bR=8XTK8gX#(BFRF6DQbIXPvOO&w+md{{~)m(7!5O9HEp^nu0sO
zM(h2%S|1bOnphIEA(qZ`dFMC4ALRaJO}5pffzTJ?*Xg>f0Uv;8z@NY$z?+Vx+!b*{
z?9F|8^Lt#!`w@5sJOrM1fPOdWrZz}O<A1Qqw@%;h-wdC7oZqKDvE*aSSNJa4^IgX_
zRAXMB{A-nzIs<p3|8Kyv4%lNGaIqAwIuI%8|DN94qYnP5O2R28nmFO_JNb99_rJU4
zw<a@>W_nDJYb+Fh=I{UAv5eBr-;jQdfc#%!o#F%C_rHJ-9rU;4oyCh0`zvbVJ^Vrb
z+3z^NQwHWYS`9Sf{cTWwfA2|uWnpAH>SCt3J_3({e*%91AKc#$?E>Ba5rH|^xe_hR
za^DN!Z{T$|XmJXBroX!fT;wiV*CCexn*?2c%5|olsqQV{1%K@u2Mha|_uB$K$U4`R
zj+Npu@R53Bl~aa;9E;v&lKegJqUSp3g!u%QUp~jFx&cB8Y;Ya(Bj9;Y(CIDUm6Vy!
zX7_o!LgPE)Hh=0+;Ew(=73Ep(qe|(xY!USQG4Qq<(jeuASnJ3-&2)xO-|sW9MBMC8
z%Kh%M2|S~^`p$K@uBoENjo2T7=fD%-eTM>RGM;vJW1Hy|mk!vMhadM`CmvIvoUJ|2
zRg@OO?-F$T3GkxBdvKp^;3dUn8#qfA^eAB~z*FAeifSLDzsE4;6dMZM(JHi_ZwdOo
z)sy~Ue)t@B*Sd_(9s!%*k^W!N_1OTPF`s^xHSQ50BvHP~M&yqj=dS}x00=HV$HCTu
zbZNNQ^T|Q{vkds$k#T{8!dB3AvEK{eDe$s;6|3=2lU23_8x<AI@i+2(C;u#wRkwi+
zPIbrhy<Oh=1@HuT(=ESF|F{C{R{;|#v*_TG_V0ne`TKFt^H+#la)~wNREkcYfLFi^
zOCH&$0=#C9WSR8qBSqcse*|7~o%nY5`B&(7?Eq)pl(clt?a_g>^LO&U;~!Tdx+sfd
zeeS+l;X3M!{A#}i3cw@*NoIM0G2Qo1z|!G>7(Ot?H3Os6{v{TkKCxD~ii1CWzzi`(
zCb>SZCV(C*z<T%FyH7|!j2YGw27y%I>c0Tr4hLiyC!oayQ{s6kJbfgf$bp{$j`c5a
zSw@q4n^px7B<lHak!v<iQvjSFyU*YM0IVDb(B%vjzyPbh-&p8hIUM(T5SZiU-3bbS
zb*_9NCQZHP!#*$#%meeJpKIzc>+$JuKp~%Nx950;1-}0#@D*4FmJi3h?&JCAIqMow
zs+<D;a0sAGiD)1*f~L{r=BO1e->DwSKeHV6AEW~MPC%TmhXW$@0n@-7mlX^EpiA)$
zSOz`;-wvXGfg_=AQ^^<sY4*4b-to_Y{IB$%Vm{lW3UCg@r^D&rPe7LiQ9yeH#95~N
z^ya@Jcb3oFrWbZ-uJieo!vT55vsXAkK&e&MUOyeFf81Jyvs|tL>c4NSHGe+{&?W>X
zM1ID&dvl9FzI-?!Ou!$X=j9c+IcSr=|M^Hjs#Dx-GLzMxD^$T5{_(oMqfAJ(LAvi1
zn&s_7R7c12eUj3R2??-$97sNosA>bW;MRAxeH=(S%cKoZfw-o<-Q(aHf{=0pQb&!5
zhj^^>2iz1fOzXEVcz8!YZ<Ulj+1<CpzO$q0A965okUGrz^mdL3NjSnu{}KyFyZmSW
z$mh$vynb$%vGea7+~d^#6gincD4Vl)j}J-cbJM|qBog)s30XPv`F=tIngQ4+o@MpO
zhr#6{Ljoyit-f2wfh6<<($sm-HDRHnS=hEP^)Ujo0>p9YZ||RH7ONty+DD&%kmq+!
zu=C(uul&&@9AHC0KZ$2Af2{N81d`KTVNK-c==1wL{|GY>75@14arnQ;O*g}AhumYI
z@9rV0+dgGs9*f|2tnc^8=L4a?RRT@_cAnom<eLgo28eL-k2UTmKL!fjQG!45$1=f<
z<Iw$(^T$8-{<*H%{gb_X$e^(KWAAURbIl*4zcm0xKlXee?|-!1_B_8e1OLeTYahdp
zus}|V_&@RqqzL?j^y_FM|B>gl4tuiCf0zFQ;lC3<Z~6R0CbCZa`*}K$N9)b*Pbgh{
zC*~iI`ZM`qQ1?LUVLvGUHVAtl3wW59w%w!6qx<Aou`kd<)i~vHG?_dp^yj2bS~)(O
z6xgwlXN9T4YUy!FQx=4gn01=HW1hbx`xp||DJ#ck(*h@ca!j+@bmIx=U!sn(I{ne^
zt^4%RJqnp--M7)`6Vbmw5B<dSw~i-I&VO9~>66}QE47{A{I)%&z%*@zJ?qE%1hRT5
zV4c++;b<1{rPn27jbZOZ{1Y-fHTvz#2*`1txAUKje|%nEk=nJvmp`uj2J-vXsIzx`
z{tdZ#!{$!y{6~|}BmIjqwQn;^$NH3I+osK}RgN#e0ncA%%je$l-`^MI8<PHR4*0nI
zQ);)Xh4>$Y9CX7>tA`&WV47`)`y8!E=!RLo9|AfJuEL!lxJ`*U=5;>5b~qcAfiZ5w
zon(&F&DE|EG5fG6ImcDJ6U>qOjM7|=?aSdg_Ca8Zn+*HN02LO`*A7pxyYr888Q~BI
zom`6PyUZgUM*j(JBOaC%!4B{5+u@+`gTN%W`Hl(b_8Rb&^H@jIe~OsPmg3qbK*0CI
z#ULbZ_cV+AK2s!igssERKh164)}*jU`hPj{{3W5^2z6MU%(G$x)(3rj^4}CNCd(ge
zzVOT8{6EB1)W-h}CiuT{nD8($E)v~zoMR(_pd+l>fc_!$pWrs(A*QGr9L;w}IHJZh
zL;4Sh^|3PB+TrIfaC7S<_g~t0!1r#%@}B1(mu=pT|LbIdqxolm%41x>y(+x_ZwfMW
z5dTaQ^lgyxZ}W}II-l9|<8gWEDY9V5hponRnEo0d{U@0sb6t*WR9>4n^iwAB##-+m
z;(sMRjjJ-h;*4sKPYyT@ToKqxQ$QOmQobeVa;F-!3h`O4^B$oG(BO2UE3tn(NC%Ds
zXMhVVvbaW(ggs7Q2wMI2p!4?ui@;TOX^hJb>^*k*xGtR@T|OQ#P0;tVlzx|^uW%ah
zE$|Zfa1alV0B3-UB%XEI#O$(o2|NYf9>l{%B8Ofh?sQ&iG_K9?HSiYr+U-jFzzlGn
z^qVKVYLhJ*pLqVIgXlj3oC7WpbbCTp5A6tO^?~#+0;hqCzy+=^@0U93S3dt5_|koT
zNB?ugou88yF)Nb42A%^S4m$r3ut41O1upRQIh}1SS|1Lge+ifa&Xa}5SSj*2;Q5MO
zZeM$ze+D=QTmsJ1fLoF9vve7KI>?nD0T%iDr|AGBqCh?YZ-Avk&OZ-a2Cfh~FHZyL
zSYQcw1-v<g|IYzeNq<+NZg8~RJ?vWCst29_G_X;Rtc3soRP9MbK~%s#hbcB0xGQvR
z!q0#Y2kF2P@9zTP8*(xv;JWOdlm0!___WY(mP23QeBTEO&+<X%9|X<<*MO@6kDk(o
zd<LFU|7`Spz5q-EXDNmA1P*mg8cun>2bOyBKL{Fs1-J;z$ljv{d-R_Hue)2^l>YAi
zF0#oahY)vUO3Qi|pLN&DLtq}bMd+;40{GCN2kR8W5b&g<Pu2$(fNQ`t;2cZ*C8p^D
zj^{7Xt$W*j2NS?G;68AJT_%G}0X6A_FS7gSdAANK6Zh%?a85?%Y)VO6PrU9<y;<M}
za1Xdi;6RT#Qk&3Tpj5o?mf>;W5^#&4_oK2zK4tshf`DFkn(`8H)9wRTfEl_k)~dV(
ze5GP|+}-IpN6`371cT^j2)qX@0Ox2`LYvXl&oFQW_zAcPoDu68Gm73MoNod6yITed
z1g^Y8;JZP=9&}SOvjjs&H)!`Pa0R#n+y*WJ6M}uv;QgN=B-Wd5KX?>42iyd10at-p
z+4&j*RbYXh`5^GJqfg)Hf19}JQ&MLPi4k~)o^cQQp8~D}cWJ#x`SQ+yJC(3NNWLCE
z!~}2^xC30H15wUySlI$T0h2%xc+qVGio}h&3tVQKBFH~3h+7~aVbGKQ?)Pu;{v7i*
z+2(mho`2m_elG(zh5yFbWD*e~ZHm3aJ$#@Xq0fG%{D#?86}!M$!Zf%s*-rky0o(*G
z0*n0dlp?fG`i}$o4#axn{|mr9%HJf{+ooj1Rok)OQ~wl!Mc@Wi;Auh%Sq9)da1nR~
z{KY}BPX4(8+@|uiHsyegR>qW1fREk$GeX?Dd%!v3H+cwKPRyX7ELmc#9m{$=z%RgE
zdcb)Z7a1f-g5`ypz_L9*1Kb4u0bFa_p4P)T-oyx1lH-SWYCwj7OTaI{ufRE4kAf7r
z1_dFZDQhQMAD^(j{{`IOE!(_t!okBbO^Pd<wC`{dxC;D2;J{G^UmmGteH#I5u1k7>
zgTO_iUY=*{l+W&M9h0@_P2icHKLF-|pMn1cxIy5(L^i+=Wr5C|Rcy~nIqTzoe=fUo
z!UkwY2x#Y2>KpYV$^@PMm7wu%+eakJGe-%j6>+dAL;pD%(CftQwuzyT19Tz(dy^)i
ztvxOh_3olHsmHVdLqzBZIXd)SgU~_X0&t(!!;v821aXPXGbBJ`b>;sMSRiocd0?0|
zkO2XzA7b}a8352wAHtpgK5(D>Q<Oo+e&L@yW5V~!{{z4U;Ah}x2|x369b8^+j*gWJ
z3!iH)&<9Qdw+K6Is>jBq@@%~;lLcdq1Dy8$g}{sB0$Uq0r9Uo>hjq@iWX}H!fm=s8
zE*OXo9%W&{8KBPdp9XFcdvu0B60q0M=cU*jeWd(930wpILDbC=25vUNACrH}EJ<Ya
zPd{)DxCcA{t}*=aIr);~9B#n2jZXdJ_~$-hdncsJr@$5OF1zMf+a2MXp8p16+vgcL
z#nhj!@2Ou3gK78q&j7c8`@mI!{!0<IX3h&;jgy;N))^;m_FZ8>n-lkCg3mE#ja^r}
zO1=B^opaP4u`CR5UD(c>-e6X_rTv`y`T)4>fE2TfT0HXL7_bOza=U4p24lc^;5L1~
zoNV;hv2UM~P}|(FTm>^Pw}9K^-V#~gz5;BTh*?tEv-A7FEN~6DE3n0(5YOknIp^gy
z*>d;B&OZuV0`3Dh30vxCmv`nk2=26x3kY@$b9Mo^P0Ub>I&4@qM-~8ZkzJoV<^=k*
z{`Y`O#0~_Tdt%;vk*deKCSTk6$AR;}b>ITEPfnKpxOXW?=l(vorPcrxXgzL`{y8~3
zxT>=#{IkWag{||?5O(;oKv%^wZ67kFTjJc}cP__kDL@zg+>o8QIk7>3Y#~Wl(%3P~
z+0(!^W?shF-R#RPSjSRIhkVZzfCo$h7l0eozBwuYYm^I#4Of&ac%}Sv9=HWuX6zV?
z2n5`_Q>G4%*mYPj{=Wk}AZ%@cepgH`EVAnpTzR*p=br*DF?^blg)<2c;uPo%TZft*
z=bxthT%nbzaR*U>5kMXo2F}v@ZY%$v25ta%s6T98I7eA%07a5{ldEo5mH%%7cL{s$
z^UOJ!8g@4D9HY@q;Eftk=Rn>DW_kZMshpFtu=C!wfHk9k8CWFj@D*T)t-TG||KfUY
z=K(OL-?1Rp=zkqp6fZ9ZJOuckvdz56{6|$m+jGEK;688<I8O!$_->vIe21LkZgzk^
z-v|DK1Hwf)bRs+ZJX!g*!-7@<U>LX#{0DFgm?Y8dXA@!Ol>b^|+jR}hKivl&0Owh|
zN`-o6gREuKns;}d`MnMN0$iexYpUC(#Z#^^DsuL7)h1>J7{1&R-#5svEQ_g6^D;WK
z<^nk%SY+_KAW$QzyhToSe&)F~?VF?j6mSptPv9yrB&(Z(?6NN~6vzQp7K9T3L%=oQ
zKY+WyDK2`_?#6;ZN`-X7H-I%q{~X;9XT1jLCFtrUU)+!qGfJzD{-eM(;3wdkWIiDG
z-{c!vSh-8k?ZnalB5;qfll4z{bfWD3Lxvw)1Wj+a^PdBLrTbNssK>6&auGJUv2F!b
z=YHM>F7o^)FtKcE3&}!0t4ghcstaH4@ck|JvgIM3?C~#h@?+hG)*Syl5C_aMclxtM
zVgnYqHPP8XOaXTozE8=%Ideiop(wcYr%nGshCjD~MH*VO-f?#CvE#owpRYLuei8T;
zxC6{_UtN=n;%#0#rWteY*}C!n1Y?J*qCX&P7jd7vN5#C$jngguzY6@!_b>3x6Mjo5
z>M$hz*OY(G1HS@40cT}jT+#(O5lU;%rZv<|1NVS`0M`lot$#Z&p=2a`!`3_Kf0y2G
zQ522`zpeW7^y5Z>)2y<z;)hefBJ-}Pgl!%Fqzq?ffCXR@2wmI90_PfYvfU^U&|;t3
zYmh%S-2vKuidCPcfT!3Sn^tq9e2Wb80fIiy3FvlSK%0ZCE9y+~3_;dI4;TgJfzv=g
z`8&yapgx`8J|b41QP6pt&a=W_AmS2B8=lKM4zR~*(9S>2-59y9Mct`XQap}<%<=p|
z08Fy*HOa`cC<-v;_Mj5cp65w_7foP@qmh#mKBWBdP*8MI^3Qw+|IBc-a+JxZqHHa*
z%P$hJc#-_(yZb-I(aJvQ?T)iT<9s^j$IS~mQEOG>6jv?gMO=M3xko?V+HCri{-;^B
zE=c!fU9gl=Du4k2-41-x;}pFIRs9XY!U^T&&r6aXL%<w&vv}hD#BzihHz7NDbNTo&
zJ^vYwwidD$+nCo3SdqBpBY+>U_Bbc1+YEh9_BA>wERg=wl$X}|7g=94zsH2PAqZ0*
z`G4B@-*v7mu)dLU_gq1;NXE!zU<8<@yaYZl0W30a?q!?K6Okg1Xy3EIS!Sx+@;JlJ
zKl6T@Srj5?eO#N)qJqvl{y)vSgD)yDAr~|Rmuir`F4InbP6FrHwcyL*!J_Qm3wffH
z^4~tc>s++1mz-Q%4q;0Ytsx!U&1?05s2`YM<nD>ff?lpnlgNm!PpwcuG|ZflsY}0W
z*KR6zFv#7!{lIp>{Ai#2W-5P3<X?Rreu~;I0OCH@H~J-814>Mu@*Gf;4zN{l3@8?`
zH_WEJf;d~|ep(!n<NuDan%D>Icx?Tfl;l^FdK<EEwaHD+E*&w(LUZbJTjL~wPl6>`
zazfA$O2CNA2o$jIE^|bdv?elQ<7M$<o%?4^syiM*!-pC9>rX-U^~KamKF{xbwYVRc
z0_K2eV3h1?=E}U;20cJmkC-NCcx8lrdwqiOm&m^OJo%?dg)z!H(Y6l^0aILTkrM@E
zA-E?v5Jo&#lKF}(2zimmu1N|eFH6`Go`1@btcc18>7VBh=1Ij+ED>mchJXp)*OpJ6
zG|j4eKp(e}h5Ksqc8jvxIRv6ITPH^;0v_FL^JGBIX;6)~MzKq%nJ^S2Qbsm`Z?tee
z&@B0X%cEN{Ndp27PBcVyBfC*{ko5O~#O|{{oBYa)M0z}t$0-3<QYn_PKM5={x$aRz
z<;gz@^EU<F$Pi1^Wjp^cRlmjf_S&hSj>McYaDBpq?r-*)FlJp3OM;j)Tc`XFS)Xwc
zyW`pJlvp_5GKcmbb6m{r_;t$%1^^jB=Dt|p7?9bKxV1YoZ091Zqavjr-~#l>#(v6@
zJ&Q#L#|*@4Tq@aQIw8nYZUPZn%l_$~O(lj>Uu&JD$lpw}*>A`<rX+wHa~iNoj}ohM
z)-~&Ky+B06l;?r_rKZ;h<Z`m?J}=I;DZPS3v>sjA0jmb)S>v;wn>xWmOVY;X$-K13
zar{<;NkPV5ECP$^2LxGH!r~(r*@YG-di-wV!9*n4sGCYYxBz+@WyT&AfeiF{{+c90
z65=P@q+eX(Z2ll^GQ<b4`)|tW@|<)E3)IH_k}xuNK`lWd>rBg}SBoD;SopT2FG%pZ
za8V+D<RBGJvmj$bf#@)w+!~Oh!2c_-NxvM0r^r|Bu-l<1K$C(3iPt*+FugHL^@PNK
zwlUjKcKMcge<2X%IXP4mTkp{c4}|$rNu;!%FUn$pl6d-w=v_+$<><ki!fR2&s*?~e
zu|Y!Qsl$vA_R4vnR^)w_rTXCW>vGJQB%%~-QZ-iu%m5c<VWvre=6@slU`6GBbF=f3
zb5RGPBE=Ao7Cs}tHujbi?=fU0N`K$-YVOBc`lnA42gt_$f#}a1J8=5!{5has_`e{o
zSKtnj3gAmqRw7$^F;i(Q%d+}}_Zji@HM%7=1Mv35OhF=rh)Fk7OgT~GFwaEwmivlk
zuPjW;weG$IrLAudBu`P3L}i0@q)4bzq<`#)(k`<FQus^QeV6e6N~n*`iD_jTtUwO>
z+x09$OCZNKVY24~ps7hJh`;5_s*SiSB5hH}6UsLwWQwzO;8+IkBU#Mb&X7UYO<N0*
zIaGBan?(aHR`*EERNk}0akK2XPZG3*IiS~7AU!;Jz4kp(q%5FmNsFZ#i~MEljc8Dk
zJa&}$Jb6w6m8Qm6;30FOK%6*OOi*J5vJNEX6BMWb^70~<2!PyTQ`M_wTh!MwEf=t8
zk!FR**T_(<tVM7>{RyK*L;iceCEL8LNSVwhS+=$7uuoflJ*ltNM1ckJYGzepdJpYn
zixw(EI#6a?JPE%bx-BsN55;u|m__n<^OpHF$<@E#`lcL1Q@e<S-x5pyEfq?efHuun
zG9OZ9!JIB@i?F0ni&O395;62~HVd9g3tcSP)s}s=xp}LVu=E#Z@WNCLwa-wOn!9z~
z+OJzi$Wk}~N2@A8WZ%@x8n}o+BF0`8f=3*yu0yc+tjWegtw==L5}-wXyH#`BtKC(4
zOm*Aas!v$~Hk=H{YR&k6NA!9F@dg@X^}76C>;I?`?7(M|z*K4~x<xD7M%vcPG^AMO
ziGD)1nq)zFOOwP@bVfE+3Hm@RSJUM0w{z7E1}G}!skkK`gR@8~dkMJBc~{olYZ2ch
z+0;vulPOgP*%)VU&J!t5#Tf|L*tkcMHz4oU6ko*zE0lFFNwyx;rih7{{@J7U^s-cp
zIF;(UfSR#jTL)Q9Zn-E(SuT>BZvA*o0FbqL$$FuztQiaEG>v2lzpo)$G8E>I<VDP2
zwUWZN=Amji<!l*GdjTQv18w@pk{$4QZ2?tkhj}t1Wq)pamWg~F#1k~O^W_wdigA>c
zUsF#FI&e8L2@r{H^S@F^9ViPq0_g}#vJP+r5nFvpqqYUE+L;isBoH;ZX2x}vYUWt)
z+?O(-FCv^dY9ed7XnMo;w-&@QxF*(pcZW4Lbk+rGqT4MBS+%ON4QmUiH@J+m!Yqh}
z@ExAD1w#9_e6cBeYOL52W~Hae@=2L5Tr~p(wq@-IE}XeF5ubPZyvY57+t!6pXIrQV
zU`mk=r66g6c*-7H_sXqy_zhr-Ln=ACf>yyx=y;h=?Y?5qv3i}v+##upS=AL~DcWQk
z=q}J`5&_%1CUjXDkV67GYJ0Z1v?*<UV@xE3WTt>&q3Lr=L~T<Ox0tCaYSc7lD#x*^
zM#MJI9iUz(oWd>}KrM6P3EHrEeKkQ$uNnVr0ILjyjRk!sMH>7TpWhNw?3xe+`5GH2
zeetn0CGF9%sgY~j{J&4^LQCN0vhJ^G>;@Hn1G1@OM_SG;f0{~C!=qx`BS?CS{<deT
zL5=4N6)b;aw@~MmRe-3*J?85N;FSjA(<VIp2C!*(trha`J_&B66N^BQ^)2e`;cS4+
zybaV??$LN7&6;$31;+wy{@G&#XqdINX*T!Jl;yK+{@KYQ#(PpOu}~`FUuf{id+v6s
zvW(Z>*)mU_xZYOK-eV_xy~(z6>r)Cu`ztLHp<mU`Z~V7Mry~_z>bVHO94}7*jJ}iS
zT0LdofX(a<r;u-AIY@hcmmteYjY~$h<N`E`w;U?O_H6^(HcYt7B{!!yf2J1D%PNot
ze|*(;yH%toE)~*SQUP=^oQtr}KHO~}iHPR41yrSo7%)$Oc28}K_0|a5ybCt<egZzx
z02mKiBTpjUoKu}{uS!ed?{m+{9N@8Np>k(YWIzS9&b=bFy7V+o^LHQyydgHg=Sb@c
z@XeN*Y;n)XDPCbp(^Zt1*)&<BgMTVqrCVY7&3pjMn_8ff68ysT5XUtEZ6A|5T0&>j
zB^7eC#5=$@;EVMzCxoJ!VGesp6ktgItWGy1q5@sBL7qMC8QP+=Xw5Sol}W&yS&cf|
z`3z~D4Y`}_A%vJ~WYt(KdtJU8{dd?CT^C=;{80D)1;(Bgf|4v-8{<B(#-75_Y!5fc
z%jQox?ezut2z&>st|9XS@R_i^ngul{)bg+UTuZTL=id`~9L@&bfxN%jkUI6vmQ%QU
zgk#udT0JL(u<T;M=c~f<bvI)C5%|javp_O3p(JP{`tIx9pz|xh3J3i?QCN_`Go?gY
z4|7X?T~F^aaGF{AocQqtK54wP!@Xqd&VktE%ZxMsqeau4I3pH;t{VLlV2kp+$hx{6
zK1G<x(8XNJ@fP@K)7TZF>6~LBzoVHm6i%p%0%<A0b%G&`32dt`1^|-%+!pSu*adt7
zUISCCuH+?<&M|M_Ah&%1-T|NV{HsLL7-5*=OI9zH(7w*SI^RqII0gOz7^MRCWff;6
zfs^xHKLW3S&-VN#_l(SN4#H~rq0CZ+wB2jK60mHOsP0N$yS7?jis#o*#<EilV)wl>
z)JmrS-vYy2;8qliqt??|uU7=!YAc{+;5|Fh0_vSKYkfWP&)1IgSNNaa17$V?LQ>ae
zI$};Bxcl35&#wdTfVXVSu(k&+<9HmXUjg0_bZ%eiKP`jmc@dhXtP6Gm<e06^|3?3_
zOpbe59d6!mLIb)Ce75&r2bO>(V3I#>+8d0;ok^?$tH3LQmQ~CF`T~676j@0O0OZ<Z
zfvtT5EE)YOMB|!gFB`Hp(9Do)M*IT20AAW4)H?71_#i_R_CP`h#++j(?C*dt%0Hih
z4{SpXNdk`zN00<vmBeoIZ-p#zmeqK3dJ;*k_>`Yj;63o(Z6is5ufPiq0GFr(^PH#k
z$vg?$?>_)<^!$#bAGiS{FQ&UA|LF<(-Zz_Mt`LoIj(r6k`qQ{5;{4)!V98CwJN@~V
zBcyp+3@f6C?Dz1vl<gzev9%qzPry51h6YG#00I8b?5}?j*xohs5n}dd4hwXAlUSUA
zNdGtBCGaQk#DM#BWV=WOWJNG5oJ0ie{+*EZEdems+~yTb^(Ml}vEZiv8+fAYj1u<d
z^izRBDt!<Iv<18Y9ujso1-+Ymws$%dZyFTI9pEkSJ7HH_iyEveXMp=-@>2WN)MEqq
zLeT1$x(+Ck-k?EFY->J0ruA6`UJ!SB+34@dz!kD(aqz#p;l0<uZym71*5E(LUX+xq
zS7tz#@b^CvcJ-Oj-;yMOxNixo;dVhWw{krP{^;PJlr4t+(mZ9?=9Brp5H|V|u%z?{
z`-4Z>9+o2sv?;|IkjDfqUsL*r><28107K}PaHGw0!p^oPF>8!BhpB+mY&F)(TkCxI
zp`O3V4AX#YkTu8I{@@(nW#B3B5O}Gu)e)1teX;=2x=73c+y<6_hrqv-{!R9h49T(u
zWEEJ<Oz{Wc5%5r9hntkP5__Y3DIHpiK!c{@G4M~|iA@rlqgN2W#TV7tl;q1g@Iqj#
zTQw+<egiUwZxx?ZphVvTzXN{(@09;NX)Z2G^3f^|j)YGCJOlm&o^|NY9B1_hvh8Sz
z*fY2O;Tid-waTJS;qQ~Ke{+DFRA<ihcq_2gZThFQyEgF0o`?YdQ(!AU^Vc8t1f5PL
z@13&JX8l7u#4Z0F_+1$wVFfgz(#Y`{%a~S0`~Tm-la6IvHYn)x9YQhxDY<6__#5~q
z@R!}b`&1>}ZThGn>s0DYb-y8QbLRY3gwmv)H6o=fdVB!>0R9a;?^uRblRCY1#imlM
zYS7VqM$qqNJ-=08i?T-CoWO`l$8W%EGC<n^B?4M*a-*XRDZ|#c1pR*5v5c-Nsc+Na
zW&;}nd-@3YzkxqH_@~MuR#O_jBH93}`nvGr-@qThdpm#1^LumyOaLNAox8*g{tNgM
zc++wI8gsqYUSgtav5#}y|0Za8=KOV0!#TOUrsU5m^nU&g{0V%p?xd7*l9S0L$T(3X
z^moGZs$(g0RoatslEt(jz-ezMzkd<*JahhrsG+<Z!DrEE9e58sr2Kp^^PEZtL@41m
zBy-yUw)p%X-OJFbR8(R1{APNpbhIA>{{sF1K1z32yS*?KXUu$e>pWZoUIPCD{;m7y
zQqnJ=1L}#=jVXMaz&pxs#(J6!3B=c|^X96&-72k@vp&lm=Z|QH$_(Nl6EH4mZ~XIz
z?vqR?Ip#+BqJ_+QIr{$z{JVRZXO$`>Ct3W^MF9T?lSFTUzkxr2hrkEfGuHa-u`X5>
zd2Yxq+4sQTbbmV;xK21Y%d=GYb=w3?eFgjl{0n&04LTjO5urhYY+lDM_h7!G^!};Z
z!#=z671oB8lJwt!CED&kfY%-8Z?Gt4?Os;FZ7@ms2KbY{)0d7PR~>Asqe7&*4d5H_
zj<Hw9eYMZNG?1p$n0x#dm#4M!zv@5*XtIcJJ-9IiyGqmf68Ho7ttaSx%0>h$d|4D)
z<1Xmez$4&az@I(Ozh`w(YL5o%TU*R~za;;>?$`(%u?QPyf4M@t{SkNrJOTddxymyY
z`X|zqQenS<OS`$)>v2#1-zVEwrOdm{V&xX_iLs}1zbYN{ukgu+FrQmlvcX;0f06#H
z9Y3D%_p5RXF$HOx>x7>Ie*uqykKNm7_vHN>!i#n0m|j!<{sNXdwt3dM-m1oyhdQaS
z!(Hgk_4rL==MMg_$>?-K#kS3i{}S+6v|p|J{1xf7HQ#@iCDyO>o}2>T?x6oZA8vLD
z?;3ZpJMH<jr~K|y1=a<AW{2tab>I`v|7Xu_u6s0tNp=i;kN3X>JOO?Keg|H6pb9hy
z8M7x^teGuZyapaJc3tUt{#w=tj({3M{9E?Hf9wVgwogp9aBk*lV4BlaR`~fq-0A)9
z=XZct4D#m*T3ux1;;L*PfOp-CnBAw!ti(F70-WJa&Kd>q3$O&d13vdWe*%1x6x{-F
zn#0xm%yYg2-fQ+=KfAmZS5KHBY;u9Stlx1T`br1)+xg!z%l?{^;=Z&AeI@AkmxC}p
zR%i)71E)CaT4eHR1$Ym<13vb|^w?)qxXOI2o0U$PxqAn^1m1URPxq-YD&7J<bJE{M
zGj%GUcfdPf<)D4`tFpQI18Z6)o~z6rf9#2AQvqI+e&@In%r$at0bha7z<Ywe_xRN5
z7Pv5Dmi2%f|KmHRo>z1as`AfAzUmup8Y##+^Y8qM-sj&2o-@gMo|_e{;ck;Y*xN(+
z|1I-eZ)I7Bbt$Z{&eEIz!Jxq9Dd#w@UgGAR_rOcwHL%>1|F*cG^BZsmm}EmhlW0WC
zz*}JH5c)4kiEEK~5)sY77xM2)&-16iXQ`2%<+7C~H#=;S{-1jCzoUN@Sms&`*OSqt
zMtBdrIfVZ!H0xi0MH#fN(J|ZP`Ime0zdL`#Ou-`eDp}9Z7hsvyv)=miC53N+%TD@%
zCa@~{^DzE*L8<%w)2+=e9{zV8z*@(S_!K#tIYtAp$JJk(N8%m_L<Abm0+OsBAGCXz
zn<9oe1+&i$kn4xTF}Y74H!F;BFx5=?_ahORJeE>NWcOyBW&hQ~;kexML%=xk019-D
z+~K#5L;pS!uis{EIk>P6Z1mJ#O8+5XOqM02G@z^6xpBz(N4Ocorb`;^vs^nIPRe}>
zz&O7m&lJQSHz;mu<lOW8c6t4LdDnNiaZLUh;-vft*D^MNEyCe#9r=8*tC)~-{jF;@
zJA(dWM5ODNm*3_q>>v621Ek*=XIu9eM{FIF|A#5U!`%I`O*psp!};GQ^NesMeUrcM
zF0Xpz@Am_vz=UkBuCecV{W$zn0>*#|?v+XyT<si({vj~R=Uqcc#3Y34v^tu92FX7I
zY#XaG6xuir{mb0cF~Z%mRrblc=hMTVAK>$&0w}bh+b<9Q3{ieYfFe8$^Bv+NAJvbS
z#U!mBPWSFlku{x|HHh>Fo)094f3n{XSy1*F5gtuC{S>5ysQJUc@6x~#qtg?7d>L0v
zWT(OZSLe41x=;RTp6GdBR?=Bg;shU?s<dKMa-!$M>~ew=-@hmCe`5N3(nV%tQa^J3
zP=5Tx{A2u|Fc)|N5iDefs_G||-%u)H(GQ$I5bJB1^b;JR$oEfW4ciZ%--^&DKEF-%
z{)qe)*-zd$Df6N&zh2M(*OQJHW3UrnunCwATAkqI%ifKcDaR9hOu;22(2tzo^{CY(
zeRhIRF*_V|g6BO!j8CM_bu#)DrP_Mp^9Rf?fd=}-23{4!iRqt{hV>s&aGtz>`~wQQ
z+rXUc_X{)-Df$1z4nkg9Y#Jw$ACIM8Uy{u~sGt)nxF0mIzO;Ir;0U?w`=6M8xh?~9
zV*W4ucN`RTuztir33!G0F$cSw1!@QVz0PxwFh5}Ny~Q_2hj=c<VOgF50Fs|rJ5GpH
z;`1W{@wLeX=|_Z!fjm466qzE}WR~f;DFXX_NS(<AT1T*Pnf-nv?76KGp}`&Dh!mNn
z2>Qf&>~KN*u~G!~`~85&_;}-_A@(?H1UroHw}73)HDL}ImHb+cb-Hz?xQ=#J2ALuo
z&cd5)aGLUnPch5FJ{9m5NpP4Z%`wF^%Aun~CIi=xlj0a8;PVhud{zGVu~Gzm0vK%u
zq{2k$_K_*FQKm=-SYz1bWZJP(yo2QbA?AlF%<^m<pMR#vKRydbF3WO6iZ7(*7-ZLD
z!1ng<Hr#jkXMp*!A<hg`nI$}y{^@6adr(?FEz7-ixF)yr53}VoVV!y1rlXJi{UH${
zkCTbVP4Sh0VZOou^`~o<=t2LC0T=|%2;%nucQ5U;$Lz2o4+xrl7C6lXwyw^#%gX87
zLyA1^GtPDN^Gp-FI^PC|89wNULpSN{_oqmJgk>Z5{1Wi(ARU|o=72N68DN$&3~HaR
z#O?lch&7!87Pt<4f*Z|iJi^kEpwp*;^S~uwPB2>=?9qD;yzOogQSO@s&ags1K?bU@
z$9IW{(+8e^5|{_h1Lp|d5|Q=4^D90bROuKZw(uEXhDE{}iT8;Qe?Nr&)4*9?;TUtg
zyV)Y=*F$vp1TfFTkJA9h>=AqqyaGNPbpAnLo}l+rf_&meIX@6F{xJGa0cU`V9GWZ0
z^KO`E>5w8hfJxvCa2Ysj+k{!>`cB;N<%9UY49o!+fQuaZN+rSkjdjj12mN?R;D&RY
zx-UwR@;jR>Rt~4Xd;hb*3~PioP5hQ4e!b1m0O`L#;Vl!M=rb$i9}m$#fq<^hi33(=
zm(JTG`F{pD51c1_gNuM|LSczT_TKdG15N`Mfio0-_xv94naXGR5dJ9wGr%drpUiS_
z+UZX>AG-v6KE#Yn0t>)dU_qewBAE|-+abT9{b`V>gSUiw9^?NC@Ev#t{L!;b%5mRW
z;5Kj*SR@?404?}7JIx*fkAU|*b>I|n(=YSw`()#=>+^mBJO<u$^Uxr1ldh6`hr|Qk
zWT)vv;7Lc5w^3qVfFGymL4Y=J19-=2=65~s;1qBTxB*-w9!QxCw8sJ63E&a?IWnJ7
z;0oz?9+;%|@#%!WXWzo}p7bvO^TdrlD+_SJ5hphd`V@H5lm0$|3vUp+aFQP29)qY)
z>@<AU?TQQomq`Bw7U}b>kZ<z-9s_@OBX+y<p9Zb~H+lbq?AhLD`11xB1)c#PyXhYQ
zv%n?b25^~ORwV#Tv${Ex37`nP?6v`R{_7mM4P{T~9`F&E1s-xty7T-wU;(%aT<7`w
znL=o=)Av0qcRkMU12e!yVs|bY9t?}jZlHS___MoB5Ws+d-d_|vpC+3PzX8uF)SYdD
z07}3DaGkj6Gn`lin?S$w{vH7zy7|W=Zv0*1mXESb;f5TXWAOYS{uv@}+bx2(lsGiz
zl+PkC4?F>$_T-<C@^>4!%=fp!;T_-;@DlhN_#0U2rk@AQ0k;X;J4xk}u+Hup-Ch7s
zfY;sU?<4B$UEmzSt8Bz?l`Tq7ffqf`Ujohm*MUpGDF%U-K{5Gz%qg`V`tuZUogz5T
z3cp7;z;$m;1ApoHTL!>s@4LV+z*Y9=SOVWQ49^P~c+$}-UIZ2i8+sL(Vv(hfQSdlh
z7lv8-f76lGnjq}(J>V=&M<{GCM@TK#`tw$0s2?~F`~>_0Tx5TM4tiN$s-Jn4%&0*?
z*wCMWo8<liRY!?&@i1Sf(E&OR!j}F7TxXBv5Wm7T98WSS=D6yu-r*o{8TbeAD{v7Q
zXMI0rsc=*rQrAeF@q=;%Ew}UA!UK;rlqtF;ZixC_2mNP&+rWPSH-LHmpu7J;W>IFi
z3?>C$s10`XzfaV^Q6d014{`*k5*2QkB9YNS1>g*D1GouXV~?**oK*?(GtSm2bf5ne
za0|FY>owXnE;lAv$_@4;XQcOM-~qM2o0qk*hY3jvy8Fx6V4twLKLJ;${qxiTO<*LO
z*MScD82xYZ{m%*Zpp9?W45h1^|NBV)pMa~>emY~^AX9rB-OMzbI{M!u==m(=&lekZ
zN_w|__B3~%{};;dknB82I7B_c^E>)y`2Qm1_Yz}Ao4D&^UTl_pk@S?`tH3?rjsRcg
zC6PT!zBK*QgZ}>o{6ysJkfIWi1t*E<?fAcKL!AD*P2k8mHt04<g$aV<3^OUyN&f}%
z&n5nd>90Ou2$-b&ITn7~!T+<sZQvikJ<3mBwq-PlDPnW%FPsk)I_f#@y$@Uk=9qeQ
z`YmKN!wh0Wb}qI)lfZQWZ7#{2wGD5FEUPrRZLfWY&V9KnxZ^5)1;OF78L|fO*12dQ
za2mJ`+!NbhkP24H+J<APDpzIh=<VMCeg*CWi}c;V@w=2|!D)t5o9u|roc|VZ6IkHF
z3@f79<opZ(+4cI)>8607_dfwQfGGifgv=Wc0rPA--e+RA^_c>$0Y3wGfq8+>@~AzE
z(zIn$J#8I41e^nIGkh2l-yx9gy__F!9pRPZ5uc#<w}5lmg#otkqQw6m%I~-7KMmZX
z^*%-S%MR}pc!?8?DPO*Q{&B*#-jvwUTGV2aU<5cVDe(`cy^a3YfUzv|HaUK2B#e3f
zU8Vme;4W~R*r7Ho?KEytq|at=m+kps-~w=u+IN%&D5VW3aEm~m4!z@_mi9YE`Mb{h
zPdR&@<06$n8VYLceQtdwfa}0NfV<rI<IA3ql!)tPdIj$OKj`_-1NVS?zy)d_8`la%
zPbX~o-38ve-^c-rz&&ceDGIMGe2Iw&Gt9<urvlo;_S3*s%Fh{wZz&C+jZcT{o7rVj
zr+t2BfbIhqSi1130;<$p{S2$z^x$i!e_ZT*6SxapmI;+Q^=FX<@uGB{XXrl*++g};
zUXs6_tjo$1<fzFpv`_Z@2yg}X75Ei6PcJQzxvzw7<VBKTA9xAG0SKDxZvV9S$cZeR
z^VxXlJPRvJ06-sb8Tbjf1DqD%!kjFiNlA>50_<ucO{W<bfd^Co6C%CtG20#!N&BE1
z-rjf8e+&2-xGcbR)<0r{m)7_hv!=J{&fgDQ0saH{8Mx33xzm&7pCQ}sw}34(!t->$
zFHm}|N6;=njakgHV9lNX2JjEy7I2Dr@45_bCS-$<#$=B*p%(ow0Y6du^htxLo<EeP
zpbBfMkuvZd;0&<~A@hQ<EFvh<Hg8(n^bl|b_?e*VgVN&*p5I95zIuyo*Dd;Aqwh6O
zC^c&{gfL>BzhBdthX6PO{LJuag!i1M6KCuY3*>>T?AHN|k^k>-Usour)kFF~1sb3x
zu+D9_08j=l01tpWz#`kb^K$=??NMd!40ZHt^Zz~IJ}@uk&%7KyXAon|s&86Nag4F+
z4Q9SVVHj&9i)HC}Mt+OHIpAmDXW%@2_dwP~)ny+|F)RNK_ihUeKQ7aKHOkw<hgg>X
z#_TMpI_r5C_zwX+_oWLcmDOF=xVq1E?WxiKXW%E`JkuBG5*uGuNbFJmTl{bO=MrNK
zKMP<MQ-2ic6>J&*PXM=oUrGOdfs^off4&4%p!~JyKS=HEVy97Q0YvsIii|89Y^!Lx
z^Pd5J1^yGb%6*MVwt3Cy@8W=jJvo)Y1LlGIz<s)JzNjm+oe5pp5@&zEa<a4KK0E*}
zi<_O36~QUbpyBKWHwL30xD5P4_qUMsof1B5*Wg0?EnvkJR2=u+0j6cuq77Vwsym<M
zzHdgrDd0BnE6;D9l9C{luGEAEY}FYcQvkPtS!#(u?%m@ngv6IiWXWsm{5Pq6JkHD3
zMRf)u6;>8%o!=RdyTBstvnQ$(*?r^S-ja3Oc<k~wH-U3ZQG@ief%Ass!<!uF{OWAz
zEUm`{ZmU$biDXr2LIKzTR_y$f44yBtYVHY6HL`J7NaxDU?d<^=paHzW6u)WQf^2;=
zb_!TnSaULH{C|d-54*5b=;xEPRbZ7bADIGp0GwlnFPGi)Z;Y6yb6aIueb=4;GVn8S
zh4ix_mw=ks?1|+i-00T2vmQ?SoneN<6ZaRgbP1$LMdf!CxW@2*f+pG&c+4huUp%o2
zH?+6U{|j)7$;VK5R&`iNH4fX@Z2IReGcTSveID7)6PE^DKGN1dH-QJh6^=HW@=%^J
zAJ%70ebxB?l7!y{{-CL?gcQkBwWX|Tt{MN_0~VNJO|p{fNya~5Rl^B?!}#YH-hW=~
zrn(LRw`t~C68+-n-^cLf78icnY(OpBPj9{5O)gYe_lv+dL7YwW)%VYf-bjHQFb>QC
zQ-%=KFCfZ=EPPrjqtXfI3tS-XszknffpG)T&h9z%nM2>>GLw*_gsncqA}}WPfpWHV
z;ge`4J!MwPXWaQ+-|7r^8+Zc16;TuA1rc<B2-_3({$=98j|q6J1#d>hGSQ01fK$w~
z7fQecFvrCRvHW-}j<03G9I;Nb#|mKI+XO93on6bdEOfa?25|Q`%N{2T0Ml%H$Y)nf
z$AZVzl!1V}mH2bO2)7E@XmXh4d&kLj?o}No$a~r7KS|Z+@u!-CU}?(OnPhj~89RT8
zbFC$Dxy^M5gdJTJJ;6@1NuXFHi1|2^?ne2Tr?5E8sevKhf5|+!Y4Q&QbP3{##Nx*e
z0y8|n=lFk0+Q!P_`}?AIz~_g7)4+lg*%B}foRPXsAZZU@7Eb!)pAqiHZt?FZFwfl-
z0hNw^eA!?hv(<N!ZAvYXnw8&&Ag2Vh#I7bgKzW^aBrmW&GAsZi_QH1mmQKu*{w__}
z8qJ?&<|U9|wIEP0G4)T5f8W)k+VVTe44W_OiDNM!Az!BqOtH>aDsnV_ScdyRYL}?d
z>_wcuJ;g;9E&Vwm`p=VH<DuxEB57&zK4APmBqe*}xkQwY<u9E9p5QKsVwfiadyX7$
zx1UP<X5Z?|V%jpOQ_FnOB#ZE1$~2JCiV|zG?##6U^`<AH13r+50SP6#b>z(#&>SS<
z_>hc7#xe^934z@OT8Xl?<$6T6Ttl3|K=|30n$!^AKLA1J2L*>4S-l0G%coD(&tCmp
z4)p<Q(}K86-M*iB^QQ3nK!z9<&~DIQk9dD(*qur2BeaJr6*SAl%^qX#a$bV9l=L%)
zC}1qw2jmN=4zlsNK(cxABA(nCXoChRFfjfz)03mohtMo9$R9agflIYF^Gwc~(MrXV
zq*?3<C!}@G;urI%4G8g4f!KmbVj`K?AvPKJ0h=N7)1@w7$>Y~q?9?C?O5~RiXqLFB
z*fO7{?jaGh$zniFS^{!K)JNHJkQe%g{Bx9zAXC=!T#r?2OZ60Er&vVTSU4xB4Uy;^
zmn>_R*|BMv5AY9x_^}oT<OMgrkVn0rX_F|Mb@0Xg*Lp&c!rsz9eN=z}$C*>%;zSq;
z<br<I>2g3bC-t#J;?-0x!pr`X<CWz@R7<=`$g7&yo{}T17*%3Hy3h1ye|E{O{iZU4
z@si!Y`_-sO&66@m639O}xdPCn^|K&f41NB^l-l2N=w8+jGFQS}p#T!q1^la_1pj6O
zysomK#h95D(-FTYl&~)n1jaz{e+xk9%NvKR5&GGf)ee#q3b)7`h_eu6!w;i-Dt>TE
z!(zUET$HsqzWDwSePWu#mt0WZo!=9V(YirMcJ~FEG#10uqQ9lLa#H)Tkjs*hnvOA6
z^-W#pff5S{rv2<K%%O7~6(!qD9?-ihWfhPozu>s5$ZKmzOw|r>EOTffY?bB5Fq=tE
zgqDFb0C~}P_TvddseoiRW+^ik4lTcG!fn*{MS;Y8oq*c2-_9S1r<NDJ>xt1y#W9KG
z3VjIA#3CM^NL$2f_Sr#T868i5Ru5^}l8`z)Av+o&S{)z-*(#ydFP8FQj*f+KfpJ~a
zf{j>qs9M6$ldDQavAR<SepYPK>?@cKwxgLbZl6EM>WNr_2(3YAMn~QHW*|xPhGd@n
ztR?j1yPB5aHzi|!pQpMloBuRgT3NWg-#KE!CXipY1DihuUXa<zC{JOux|K(@2hr=0
zD+qb(xzMtdF&Ql)Jpz%$M3BrItU^cKr|GzNV?Y&yNVd-;s%6dPh{@j?3l*gT7@2xX
zrLm!rt-5u1V7`c9&4$A*MQgVlGyCp~U{%;lq;lrj;gQw&kvJQ6ho-tsLQ)`&1wlTP
zM7`w@A#RDOq#D&g`~4dt=o-TN!u`g;by+dq27or2tGf4u+7{8bOMHpQT`E#yX=pu&
zlBsp5MEna+s(vY7EfkS328)@RiA}hSLe_eiNC!Yp-V3t*C$h$vNX$wi`q@M`$~H8l
z-9K*q5)h%X-?YL~#D7|8sw689$^r=^=sM-H!9<+<FR9gP%+`5X1rlVfk5<T9=eMr1
zINQAANhs}Q-_ab4v_pgv**c+FP>49T#U5F~P}d_C;rB@ePZ|MR@~fVa$4C*v!ailT
zDRE4!^aual8edZR@j&?6<Za(zuFb{E(fhQtk;-2{x%8y1B2mVu5yG~}Jc6-ZtOA$K
z%O<9OOuR$!J1y$;r7a^;sS6{~2IinPg>`HTkM)%AvxTIBu5`Ipz&v_zb>YpFli`~R
zT3w~@kY}54lYfSo*pM$^orH~sfUpmXH(>0h$t#Z}aSY!8TVnQT(DJhR!{=q1o_Ad3
z7;xV<pjCO8I<KWJ?_yr7X#&Xkn@w_4Bvm`R_a<$qksxIYNvnmT$xR9S(sqzaSQlhv
zFd_-t3yy44!LFrsB@3l?!?b;5e`vC&wIM=hYSbjj)I}nQ={3-saOc+mB_K1KD;A3H
zRpScA2%^BEtnX;L0sI#C?G4TrZ#vNJY8G(BJh(vAyPZFh`)=`%{d<-A(fBkGT83=z
zjoHc9>d~l6q0CaZ8nVRXd5^zWVJBo%7r^B~d4KzuGFFGW#`Cs8+;@Nm<Ji!Ydo}{D
zXSG#^|M#V3AYsxvC+uK+UFZ2LKr<r!RbZN39&||3L;^BjG5Ou)j)*?V))_5gNly4|
zg4i{mClc!!iVu=xT>_H=W7$2RVcsON*<$tBEP0i(4Yqy$IBU>jY9YHkW5H0eBvtC4
zSoKd$7|v`Rq%9<r^KSy%K+Ugn$;hs#3uI#vrAnQByjzZeVy=4G<-O;`_UD;eD3c!&
zf)=|jiUhpS9r}BAN?YKhDJ?JC#Z#*h0ewS2-*e)-H_4a@rP6xXS`4(!mWbUfMqz`7
z(dR4VXaLs<vYmjS;VahGDQySF@lEk*D}=z<aG>pbzz(0ZGg}?9CIN}h2DGoX7bqe9
zcd`mp-6u_rBox3s8^oz`v|&H%7x|+1n*sq+<!YE+2U@+yB{y|R8>w4p?o-UAJge4%
zl&~M<JHH}flC8whxGn}f;#BG?J*E~nuMzXc>V3!>Adt1HLjP)=n?Cj>a8uReNdu}+
z>-*jKCuK{@3Z<Y%f5_aRK$-w-Ie=3D31nHrUe*lgj|XCgYGSbKRj$d{B&(|SH)p0P
zkzrMPcoRapY?@Omwi=?6Yl4FF9r$W{q4orHexH^tVQRIIP4HHD-et%C#t&QcAuOO(
zH#sM1sy*%n+XMS?zOtn%73-~CNDwZx``=dmvnOdk&A)($GJB{tvu^yeCv2BUfU2d~
zQ2MvN8~^W<a+-5B`P1ZQ%DrIUsjm`HAFa9bh~E)N(WTJU0{Gdrf%*}be5`P?%`!TW
z&V@WPLaM!-(QVBrf0SL^k)Z#8x%oV)zXp5;zStnd9>EOO>E>7v0ZCVe@&c<=xM@N>
zhp$xNT07B<cOpwwJ^^2Cl5azn#H<U1YhJvj0u?}mdx@46wEZ)YAoc){De_QAXiiXr
zpbj5_4IoLW0N3b|SVqJbqnFV7ZV_R7&FH@ae5F8|p-3eTfT#gCKeJ*zh@ehe<F2+u
zYCvYd>NKMbHer7T)`8g3e;HU|UwxC$>`PuLp!VJYRysh#xA{u0t*{{}T90RLkOX^t
z{*%$a$}3%=TcWYNIWT6Dcd39j!H(!<;42lf*(&p9trO3f^`r97HutP7bHYah3cJOS
zzR@o58Tf2-9IhvDnL5GzKusZ8g~ul-%(7cd*d#*DJKBI$*xm|w4chQsA_uk4zr*kQ
zDrTr5gP5LpFYf){0UvFS#`T1MWLsjBVyjs+3w+$mzXIL@-(3oBjYv$ZG~o3tV6-P%
zpbo47pMY=1|J&U2vqGOB%*KIQkgss5(r4qJJ)YnAPo2g@{BYAh8zfemlK(%851ohu
zYxN{&!;ODG+aRYC=@np`G*83<u(VH|ro!FdvgywX<#Cmsq_$mpq7h=jZv6nP0!aV{
zTz%4*<HXel2?Cj+T>+i~e=6!)gA)eBd_E=tQ#n8)pz(hxa9qOF*Pt|0g%VB&(%HKW
zd;tC;Y;h|Mm}g^vHAC6enL&>!I!}nYwrA%LvY^%0cWJ|4*8cpB*p*Lhnmb{AU`R+{
zu7LS*DYr^}03HE<**1WfjXlHC3+0I);`2JT2)p`7*AX??TsS8518W%O_0~9k@dEfA
zcx)SOoRpV&{=sZ3bwm0Wocp~DJOv&b@TRM-5AnyX8dzhDVK&H#>fZ#tZyBJbfR2}?
zeczZCq@R2K88`BLH!bH$uUkp7jpoFf06O|VRQ`!bzY%F3Nu_fvr1!H2d;*?ypl(`^
z!vKB!K!RXnAs5Z8122Go0}qY<F`w^ebvYCR6pKrg5V!p?fgdB#%PO%KE|)a`76CQ5
z5$6eEpWEjT*#S`M3gR`Itr5@eA@Iz$DjNR}vNx%~z7%7j2LG@Qya66_BUfblr^Lpx
zfXB{D3roV(`wIEzjTx|p&~Jp*dCT@h;>J`78~hvaw@vaVEOGWxcvC7+Ghh}9>;g-`
zL#4m*&k!4nKr>{z2;`s7z~i3u?-TwhbaezZnYsA_JeQ4ZseOHkB3RCr@yrc0@BJ<C
zCt;tXmI1IKk$&muX)q#p#B|#K3GlBD(DH;lnB(bvIl3pl-)G<%@H=69+q+DvB)9p_
zp?DA;?|F;T`y24W3{b@L=OyoGmuA1eMeXw&@C5j5XH5B?+Fw<Y))I@nR)FWg?}SaQ
z+xcsxQ^}e`#g%h3a*@{(@Cf(~cxID;4gRH2EFPqr7ovS05jM8H3n--x&hhs{zIn*B
z(H_$VZ-Czj8ei4(r%ZMAXOSH0(nYile53+=Zs)hoixQu-BCH9ZJHj%x&)>RF&62)0
z{TmA9J#pMWQu{w8zGwUV5tT++s&O4@Z1)upNy05Ve?%Li*RJ{aCaupC;NQSw<sXQ;
z(3KB?xK1Eyq|-jX5udZo|JGkQC_1PqB?*^2cu(B>S9bo0oM_GBkT*95*`n!i{GXwJ
zgZ$e^Gmw+;GiFS`PUPZ89k9<eYCO|k7V<|-q&frS_~#+;+7w{K%PWWhHSaHFABuCd
zo&I>zv5(I>Aj&cZ?lGOzr1fyt>kq<KH<kX0ut1LTV|Bx(?Bjbx{gdIJM9%Dyovo$0
zLGRP)pGUe6E*1S*5)!1@``ZP+5V`vy@o_Ws&++n1?^_ZD^uFH`xw+-Qw~{4RRrjQa
zvnl3tm(u%BYVY>1gVwbsVN*gK)@0TAJ@6R#7u~0AeLiJTHqP>GOw#Xg8|0tBKk2@<
zKNSgkEMRMjAG!g&1OB4#^|50URD;=1lUSc$QDxHZ9q>2sFNP1D=hyd-ve45_rioqy
zk9#uk9!sfB-kvq-IPrFA2FE=a>#OuvU2R@hljr|L+;4aLoBDhrMM)L<I{&GV{ttnF
z1AlicL$l<S1zwgvsxj>R1Uv<P10Dh&I_O`Qw1bu3tiJ5{XNkmn1ibIq=9!RxQVO6I
za3V_I+9CY2&$LY}oNLE%Ii^>@L*W0R_0OEYDTv=e)&@3Nr1(hw`4{k_oBsRaMp)^i
zK{enqmv4z%-bw$eL~4m7kIZpg1K!j6KI+N;H3>XDi9l+^tNcRW|4-m`2QWY?{9`p3
z>z}i_!CT-F@Mkyf{l4UU!fe{J!L?hjfk(ja<e!E<A7x2k<t|I_?=W`%llrH}`RiE{
zc>L;$Xx~49e*sTA=)cF(d_%O53Gyzz&nMJ>?>f$37v*8~@+MiZN)&DLp5GjDe(QZO
z(^cpB-TVKp=?m*tY6(Og_|7Rq3ny#z{ho7|Y873d?_}ez+^0Rp=>(^MKLF2xR|kPM
zSGk~XgP_Ud+$+>%q31pD;vn4h6}rja2^v1kSh~i@^C|EOSUCvv`W4;W&%itfW1Flx
zeFR<uuMPrTuL18FX?!4_K_8u%9qvPa!vuInfhW|7yTB@!$@k0ZDHl7u0G?Uo-0@ij
zp3+S@%Q>Y1AjiDzC#Fi@>l!Vk|2w+LAGrIqL}j{5F!Gnc8{lgP{Zq!nF3dY?gVtog
zO<);#2|Vw)3)%R`6?m6{1pys*g+llIOO{%C(0_?F!ewBZ5xFDbE^T;k{z-vlb{BjG
z&T%E0GbR-xmcIgC0n0t{EOvo6v^DD-G%fRJE_{3iya3+yq`y1A)AgTZsgk<_Cs*%)
zX9umL{=$d90%w3zT!;$($Jf9!(yzz)U2d>J`I}@$)uRFXO5FGZ&;OqCy8@ggo<z#M
zkzau&;1%$?=Q{HhhRLhIISwWEv1syz!uJkX>8XEqxL|ahDrl4xaX!db;0?cGv*-Dj
z$p0%`=RU~IJ3DmXUI1?``J$iC)KkmASq=*K^BXt0j`|s})Dzam$*;@jU*KSVft#aT
zo%SX03Rvn13sdES&~@M(cQp*qnb;)@yy$?0@H)O`1eoDq;0Qsdce&5p)ef6D_)`WZ
zxLILLK+j#(WQB#^13w`!&Pn+(V$xKl^Wa<0+q2Ig*91&)Ctk$1r7v6_lpOMWKQP6~
z@?q|6+vEAavQTy4$LCrIJ%5z_{hdReALZf4I4c^nY4I!fpTvheKOk!o`Z!_fJiu?-
zymAQrCpc+8LLJ!T_Pnobo;dI`3`}t|!~mnoDiz4l=Pz;7#uNphK?Sl(`X9!B0oRI*
zvmR4qr;F><`g-{Jhq%^YQgSiQef&aDs`QZO3&127nhrA`TjBX#q3&q<&k&EIpEl6m
z|7%a}tLL8*`juID{Ki84+L8P}Nkp_E8L!>s@}=*GgNg-&lXFKH<mBd-$nO#KcWX69
z$^T&6;4)wSu=D4DNnnl!FktCxkC-ZJhl9F>vgu`v>jD}KqL+{4pFZlZDK0xP{c{5S
zInDd;BmG?G)UnP#222rDa1ijBzjp86Nq?{VJA8Jm^|7E^Wqa3Qs!u;9`d)eIJKR71
z^Eqm-5+i_p9w9mMVUG`&seNnw{pK-#yLAl%%;cuh`dB;W^C1gbR!`Yy<MGj-fP=->
zcUtA7R&<=_J*i<7Wpbu+qVtz!sdmH$z{=s;&wY%4^0JxZWb`k}`$tT$RF40AAm`7~
z200cw7XOrGuTd&p<3~U2K2}L7uz+NZ+sFFk$O6_0TRAEJ_i@ulO8(hBCjWq~5G9sm
z8odAGo<AfDl*vC8vhdOT<MHxJG{7~Pl{qf`2bj!H1k>uc{BOmyGLr~){$sI#y?mav
z^U8_zXOX{OVCP}wMEWPl>dyw>zX$!j)^51?m|xv-Q?_oSah$8|c-U%N$Gz8%k4=64
z;8%FrUV#(;cwTzke&qK9nHc&J`ukGfk4{Q|NNZ#B$Ifqij~XXJEbl(QHC>$e{K0>R
ze*aziL;BlJe0Wa=9itzn-~WI7?@4pqk6a$4Pe1<Q=k5J}gnrgq{)76@7y0?Y^INC*
zkKA7<!v7<m91tMHEy0c6J&utIfB|5Ti_3G&k!^Dz@rX|!cd7N$Jnhjv+dIxp6y(Q?
zTrRUi#N?yT?{k-3KYROav~K4(px1rCFfhW!=`ln6^&=6Z_59khTIVFtv7EGk@856R
zh<JqUgV1jEQ|3}|%du8Cn!S5``j2p(l}k}=^T&^NQbO`iKb<_w5^Wym{3X(_Pw2Np
z#Q3AnUuM5#nF^#%{@Fc_Ch<wXK0!FD9Y4tHXTM$_J0)uz#@sk2|G4UocI@r(6^?e&
zeO|tGU0R)U`#9$>bCIE=f5bQ6JtqIE{OV}x_A%-2iTs%U*`@IA9r?W7zw*y60XB|H
z|31>sV~4(b|D)+2$SzDXAQhhfX#LsG^Owkc#(De4;r}A_?*LFhi2sdctFN3-`F@Bu
zIRu;{=)2X4YwQwOCg^u>H5>v{z-eGn5To}vmF&`7D~IIRJg!onBkof_P>|t}55O|8
zcF6hrfLUN(iew)9UAKVG1bsg2{ENT>Fb7OXjo~X1qgM{mVLmX)RoFA!y>G)IpE=NY
zSekj5^q*pmJ})`$j|9E$Ertc+%1v`-V30MI?<^93KAip&z#MQIm<OhTJ{CA!f_M3F
z&}#rAT!lR&+q+X1X_q*n)cgEJCdg+=zakCTH{c`i@o@SBm;x3ET5kgAY9wFC|A+C<
z0CBsg2rwGbfm$aCz8%Crj{fs}{~-nebq+(lIjF*Ip9v<oCq#$EG9UN}SUvLmGhDzu
zEM>m$6rryNso?;a;wtTF3V$p$!e!v2bw_r8d|-lb=hJkcQWj5FILvogjiV2krv4ct
zbX_btdUyT<>2LfyDPuQPg4Qn`M}F<{$Ed=p(rEYL5YT!6{lF}+04&G?a?8<wBLDQ}
z{}7l4=4BgvAamiXEHfVF!`l0w0>*(NLi`VU&|iQz>@w)xXfg?$2d)Aafq5bVc+{RN
zzz5(Z@U&;4y9mq!SAgrl1+L%r*<!H6CgZ<>R}N>PoBFfBCEyynOhy<4M8F2{kw5+%
z_}tSJjsO<~G<=xaGh(UwBb%(>ce}Fg{8xe7z(pFMkX=Hnz&rka@AC)19B`eW-Ln*+
zh%GW7fzQC(gSwm?{VxGGfD6DhE1WKhblp$yfH5}7cb>lwI0IZ^i_{36@SF@uyaFBr
zFAh5YDc~Y-4Y&j>5O}Z7E{AXIG<@0J<z(l-LeTqJR>&dUAM*nGUhjGSYrq8xz(`hr
z-viHq$H42JI>-ZNfh&X!o~8Q;vY?yhehR!ih<}EG3&3^YJTM203I97E>J#uBc+#^`
z&dz_43Uoj=C~mS%`62M8=lLD|o&22R?)?IN7}tIB0(inkw;tyo2hIXlfU`to7$#_X
zg-zm54{G!)QGRa%7X-L5r4IVWHp%Bbb$AGz1}>3#tjPjE%r?i@(k0Y`e@1!!OTa9h
zXcq+SvrYL4@T4bbeHmCJ^DYv;qCf@yoh^p0%cnR0yZ67!7CCd`n`~5i!4bLM@;d^Y
z1FivAfwRCkg|EgVyak>E&w)?f{9gnX`TL86Kk#UMwt0Vl1Fw7b$#}pF^~XhEuGM81
zVwN&EPf!Qfy7RH)9VUURz+HlN&+-COW^tz2{oq*gt?HlxaE6$_R|$QUV}+^0PPtR;
zJ9yg7L(>A<ev07WW$84X<N4jXl}{b#9}(ExMY4V<GaRu1VkW>l^@EDQS;8jYB5+||
zy4A<nJ>q<Vzq{{XmcWBI1+?6DOStX}XV~2NJ9%gVxCZ=6;Km_J517Ilr%1cdFk{1t
zz&WB8&hsx7IeO)SpatL*w`n!Idj)2R`{lTIioRFOBJ(MNT)4c-=WhC627UqVXQ7iK
zcH55;dfoN@cG7<lcmVtaTmUAx!9UL&_AX;GXVPDH`$01VEm!)5EMV=@Egd<Ce})Ns
zdXKpAgJeD%I5|(c^x&U!z|X*aivEy*w*aUJ8bbiQ?6v`?2pWD1m}PTQz(%1u`_2aC
z{F%=vaG9{dr}<Y0BnUapD|90Grh|XZ5H$V<flDD{y3UE6WK8g)`~0(n&ArGT@qQVZ
z-T_8|5l&<KIw<8n<G>Zl-va+$k*x+U2$&F9jkl^po%}u^?%_CN&jMfGC8fp*Y5255
zem%l2|3u){L0S)=eJb-*e(pv;cf9{FamVfg=cP^B)?T=n*22;y^}(I~ybnAe{|^)Y
zsmkZKfoXE&%bxsy1Gop=Bny^+*tuYNPEZ!8)%H0$)A|em=LwoT&jh%=YryFVn}c;K
z;k`XSP0;n5)bb(gNTt?E+KVU>fj)12hJdrcE#MAtp1yNlFc)kQK!Ng9)m%gwI1Ah-
z@ZONY3CPY(k5bj3NWZt+p9Agy{{h^f_IJK($_czdf?O0B6V%iR83oP(ce1$YMcMY~
zqT?DcMt$7S^Y;k@TqGKmDNxrR)6edcB6sM#H5r;E_g(=;vl})-R&`ueBxT3KOFjQM
zaEZX1^PJEC$4yGiN?T&m*^qDcd<j?tZUA?Hi@<nx+|Xr$2bh$~aR8{zKTdnx2X4?x
z2jm|oE_vygaFf-qH3%K0_4ySz$L=^^xDaGv_jQe){Bw`MedF?jF%eF{#$&5$Evxih
z2Oa>oh`dr11rh;1Fe5Wq4dA&d;Bnw0@BsK3I73v|26aP8n!d`ciq$&!=NuKdY0rKU
z;XL;M3^1&7edTTX-v<5@xC_h?biN_$%gPc>I^lh(^dA8(l7CJ~^2~<yL&~31tPv-Y
zGW`Dl_=WP5mj!F?6>_q5&8d)g9rF7R!Vc$T&xO0fGQGDloAxsDI{{n-ZUZ-HK#JKt
zCDyb9X$#5lzcZk}0#_ycYx2OcAn01tTYFY<iu&^wFfHr<Q$C!sWi_Eo{YDMoDCPGS
za8}UZstDUy7CDZ;RoV8uZBEQIa1FQ%Tovsdk&H1XF#9Nj>)aQ&?X1T<a7`lbP=@WS
zktC#^uM#o2#eg9&16%+u0;id`_OctF1A?uAqWc}#vh$BKd2xo^6OiehnXU0F=455s
zmh%H#=IAc9e?Q;ZlWmPL7hyD+3|=!Q%$@%ta7khZdw-AkLLphD!3Bez=XbVySU|fy
z*|p>pzyKY>O~aiYWb(@SUM1OfX@k!JS?VGSs@tZ33&0$31Goj8=X_WoJFY-WQes)N
zV(Vt@{1<?0qJ2DB69)<S3v@tMjemw1eqASMdLa9?jgSS(sTK5F=g$GNz%?e{#+bHg
zaQ{V<abS*VwskK1YMKCE25tkFfN7do8x{|!$=wvgd0>s8@w))}N&oZA+~g(s74hYZ
zz!U(W&Xn%5@y`@+5x4@J28QXi+Wp7W;bZhCwq!eGhJP;60kE6*1Z~?4*e18_ZaK4d
z2Dk}aBmF_b*vPBT-@hd5#Xs2jUGnV)aGs@~B40k`wGWbo0#4rS82^j`7lB*A1;Ta#
zD+tv0`?4e^<^7M6u5RvZ-SLmf?@huECjyoV`fz#LuzhZ;{bc-eikX{PDmk+)p1l7k
zuqd<7J7z;BfJ;msPcr=QWkH~0fdOEO*^qUs$@c>bz;z~{%Gp(?=EM5@nSI{eXFLBK
za22=?oMDW^3}Jx(OEApMq247ASDkgg1pETr1!kx~Ji5OPMj;*-7J7`xoD5@{!QUB!
zA<fdzTI~}$rxh;9Y=Xg)OFqpo__Wr$M0P$}W7-Zu(<@;7cc137C_B3H6o4j|k>uGt
zyk(&6r-A#xPrzA*$98gG6i_Nl<o7sq6*&cVg~_jJI{x-o8}tA}fds3O2P@7doCkg;
z?tLx`VUtn-iUQOXQ-`fP`rl&kJ1pyx0zu0)Cpa%D)fW9nfvdnzz*YJVMcFlL3fyB-
z&dSQGj)6<eKi#JLo|6F-vt*zc<-~{B0x$<W0R917=5SBU?KSrMO<5%1^Ze_M{$s#Z
z;2v<DL9(_^dcwkyYz%h%UvvC_f#*NP@B^}ODwJGOELr{UP616Y_+F6Ar^mWeCA%}%
zIgwky7huE5spFpqz%>WsN`jjjFm~{SeybhyzYp9776d!e;*A<Nv72G4v8=o4=zkBm
zPwkzT1#^*@;T+j1HN<*H<6+<$lSeaBy7EPbno=lo_IAxSJ{N#R;4W|nm=)&D$*wSS
z#_aw!jDOAmze?<2t(TtQ_n7?USW@|B@;l1p@ij3qrUHVjB}&PTTQ)T3<o6fg9x&I1
zn`g!>p&}@7{CUms&lGTr$)DjYMhCLLoTEK(y9-+UGsf`$Ixs~8!tmK|W0?CK;1~r?
zDd={YxW|Lk4+Vy>Hf7W#J@ddIaLUTuAwi%g=-s7s+#^{$9?9O+A=ap!s&92JoEG;f
zCtm@6MM{-6D07JI=NAMwIu+7+S#Kf_i~w^i6cwCbVT0BAETfnQl+dRv0Mo3}xHJg{
zfl0PK<i&bMvi-R!KB?;zm^J!a=fEiM*Csm}ym#wL%maM_(p_+)jSIjmV;8d+fmj@q
z<Pt|I$Ea#r0HPDjFNfJ>UEnSH41$K)X)<WfPqXuBhz8gm&|I2jVMf`e5#)d|wtWoq
z;W+F5r{YZ(`0$v4wvRBmZK@NJKQQ;LAb64EY_kG@J~mH`)4q9Rp-^C_%?ytcw0#T+
zJz$7Mi=ue#V76pOM&#T8*cj*V3&y{*OfmP9|IJsiXfCE!8)m26kez=@M9LhENET2*
zY;XyfAphiY)CW^6?!|ohK=v-@NKtqGSp}UR5`HMicDyvZ^w_5X8)K)`h&xn?)2%}y
zfBEdv_e4;=JPt&U0%3^==u=GFnCs}1i;Fy0NQC$~VA{?<!0iBIQZz|rRjW=@<hV^?
z(&%sW948AlxO&QTR!FVv^v?ot4wwYOK<HVPBvDhg#cB}N*9|&9%C@~x8IH`g-aKrt
zPqIvY9}9|3TgG`R7jX4(APe(yl1a5#e1Nm$`6B8gl;*r}UqUCwlVo%xVleCk*rNd&
zVe-g?*7_VQ6BjTgJ<LIPA4pt*)fmWT*320B^x7b?q#H*s<^{C9Nb?<t0kc`2gm*AV
zg!M9*UmumKQ7BBCNG>524|0H2x)6x^*|C=ss0fG}vRqm~a`drIlP{q@A=P;>+7IdS
zrP(D~U@puRDOw7lj~Q>DI@GMEO`+u^FVx2--4KY}c>7d#-L>hERM71_{%ekHO2s1f
zN65OC`AOxr(=1V3AU|sS<LWaL^!=Iw4G{3LqJUr|>^7ZXxEDK{HYS4`Y1Vy%L_jVP
zS26x6VsC^c&zv|KzIgDS{1fmMT%fh%0e!$I3xB4AA&U?F?4R>otnRw}V`tDNS*VEk
zzmZVRv|1{=;fH}@5zP^5(_$80$Oe%>;-%!5QF?2wuJ;L69lb2Jk40^u0SkC#qgLN`
zI(bZXg$MEgSci{kQe&#HA)t^)G(gwI7j7{>+UH#~d9S8FN{~hA1FSlyBz}Wyqyt&P
z=d-+GQ#w(J9lNGs&D~WMXpG=&fO9~K1?&w_nT1)h<zyDsa_M=gW|x7D0x&{mEM>jl
z6i5)Zx?XeQoEBKnYmFI&S$r$Y6L<n#7mFAeZ->r<>*v@_L9n_y{%T3KK$=6AHy^7c
zg%7XG@k^+HV)B#z{lpAdD#*d+H6-#yAaCSsmOGS@a1&j7xG51~C>q|v;r5#Gg1Af;
z^P1{5`Zf9TB|4W;o>S$1HqQ0rERaNm&&9YU=fU-7*W$$Tv%Wyh#F9=itJqRVIXRvu
z&{L@-kOL}~m{;fwepKLLLNU6KATE*r5;GCl+GSvgiIfM6L~e8jHqSHGB_L|uN|^%e
zSitz#V<=}~4VaJavmT?ItjUMOJ9dj&k8t;5ByZEdUrfKTa9JE*UvyK6SKi)~UzQAQ
zlKnN4Jgq93k&la1NTDojHK!>jK3pQo#JR;`NdC1Rksxbse9@;?3~bj#^MW)|5bG6)
z0#}zH&Yr>p_0f<ldJePcgOr8Oh|ia8lTv_vi|#DUFj0Zf%sA4g{1w1_ht=zYtRghx
zD;L-;x__cPHK{qh<}_FtEa6RqafH))VMzJYieD;b)VR<TyG^PUOc@TymUyFi1Vs3M
z0kp7UqGo49_y7O^C3HntbYx+4WjbSWWnpw>05UK!G%YYOEiyS&F*iChGdeUeEig4Y
zFfgfZ{=)zO03~!qSaf7zbY(hiZ)9m^c>ppnF*GeOF)cDUR5CL<Gd4OkH7zhTIxsL4
SPj_kn0000<MNUMnLSTZP9p}mb

literal 0
HcmV?d00001

diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDP.png b/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDP.png
new file mode 100644
index 0000000000000000000000000000000000000000..a3844dc2a752154d96c78e86f832dae7cc75479a
GIT binary patch
literal 206668
zcmWhz1yEF78(vysL8MziItA%i8j%u^Zjq1{q+{t&ly0O2k(36JT|$xWMt1q6L0Djc
zW!d=o=gysT@0q!0p11D2=Q&U6b0b|K1uF#r008Pe)iMPD2=5*V0r&ntM6hZ$-VMa=
z>W1n7f~4}t`+1Ic@1!sFb+rKa|6k=lYP0WV$o-#M2Lb?;bpO8z07WIt06+{tPfPtp
z=vo7IG@E0!b2k}_X`hw5hm7LSh}!*e{likb$|NWEX&!@8=95v6qSw`h{8ENb7D2-V
zhN7*4ncAs2EJB5Pmz$H+YW(33@2B+p8SOO<sKOM>ZWU+&bK9ijT68D<k^egS7f(8*
zqm%jDD}aU`YGkE@U?83ZL-h7!5Ko9BLP?84=?yg5q<&Y>aLC820)m*Kq!?p_BNX2~
zjKj0zcu^z%z?z5ggfUb$GjUXk{=~I_w_A@fmIxTMO*WDSFOIQAC`=5|14d@z6<<ll
zC>p3!diqnPZ{bPtgwb(ROo(YXEIyg1nfyW$7$gX%`dC2(^w1>wxP>RevtUB<SI`0o
zNE{ewQBDE`p#4cno?L1JJqTNXcnX{d%5Q7+35EqHgegXlG^p9Wa!C_(0uZTO^I`N+
z&2YzogM7s40E7kxRbDvWed|smRgfr4?@#50z$e|Iztum!gI6{PA;{MX<B}?f6Z*Zy
zOikdCii0GB3Gga?h8TPZybP*o6{9#*&s`)<ZVW{G1E%4ScrXDtu`L;4yagr)$7%ES
z)7bC7BPq<QqWq4yMu21S&Z&Ry;3p%RhYUS5xhTZ80Tf@<Iar^^6C4j1Bc`_?iA-h)
zCKSz9*<h5%Q*WE+$-PUWdT+$7BdPHZa0(a(#j+t#JpLYzxGmg$h><5*BVhrs5|3xX
zxFaO;S8n9CK(Sydk(n8#d2C{ohG{K(FdR>LXVWR71_(~N`_LZ6>c*Yiv{Je$8^ehN
zcxZ5y4&4Qp55o|y7kw96Qk$0%G^~<jX5nEobBQ4c&;Xbm|CjFyM~<nzgB?_`Bv<zA
zc<#T?r_a(iW6R)<P%x!&?H1Pnm=G@;$-iURb%{)I(n2?BhEn5=QGRguedQQ16VBx>
zX*Bh<f}P5X)3=@)=;@>AFnb8RmD^q!6witScJnK0`UKpf+B4i+h#*ir1CCGGXYd*^
zJ&Y$gQ)cQq%=#&9@|Nn(voT<Dkmjj0Dg+Ko_H))lbB%GuW$53~QR2<B@Fc-_859j1
zmJlH+20c`xb1N6JKctLH-?BvQk%QdfTti?gJh893J<N`)zwz-lbDRnC7AX>v^EEYA
z#`YJg)}V|QouQezZc}kO^;x8F^tnA?AzyiWKP;93i|o<>A*-5_5^1w|M-PjJmr6AD
z^_eNMhmFFm8w}9q63q^i{mR+YG{|9sq~pYX(DE|rMQbY+DQ|KJ5V>0Dz&z}ZR=I)=
zswzxZ5Y<Y}1`Q;d&B_^;dlwID+HM;(9Io2zhQBN~6(vyFFStkS@xgIm$}Lj=UI>~K
z9PDiH0W#`CPFnGW=&z$&q=q9CtRPoNn3?IdZRspcZXwgD#5<`^Qw8157!jS2oAvP5
zdWI$Mq)3%Q$inj;YXa`uJeO!2T67caI0)hR!E$>L`&vEf`0s56B<i*AV}7<CAq|au
zyYsoAd(8W0UY#0=sg2ps>Aq=V-dz6rQqaAIJBz2N`}v4_B(KdEBs&*wSut2%DR|`F
zQr?{WaeE|)s3PNUGiWVL*)(aRe}1X&@%g8K+tQB)v*kQ$9a4O`pO1>YAKh-A4$!iz
zFRRLN5z%eiF@;}PpTEvReu9#h47Nvh6>V-Td)@ASxF&%U5yuG3<eoAZMgE~u6P*AU
zZOMlzM3LMZk))LQk<dz}AQ;i<9+0Usle<2BqjBRG$7FAPqdG&U-;&|}5Txo<F1cGa
zKwF{)q10_5`YuUnwo6cHzOP4j#7`JM_MbaCwO1@?-)wTBjb9>25R?iQctFtQ=OIQo
zWJ57Kr$id{9Y0~ADglv1cs$(wuq{CF%oyX9avWRX_fz)ra*^mby6e%|bfJ*;!{gt(
z)FpJ`fyT{1<a-f%S?65@pMB~JH!CysW+*fkDc2}sFQ534Fkd4JCVAcV5-_-iy!EHh
zeV18Veb^(PRmhm&9#v^co$D-^R^u#gDWme4-(ER!9&UpGl`AnT%=iGcNh|J+?(JVQ
zz)KHn-?tuFf18XtSbx4V7jkt8>whIH?T~~U9>1N!_cwz7^HH-8<#OgO`cMBqN63Fz
z*YFFdfo>y}aG_RBa;ypL(MIITuZ)6Tla)j|`GmIn<#h6SbgNRmWc;INdO~VS!GN{o
z`BsrsEK<!eH+9rm`u7$+_14n|`3@^jSfJH%vk}#4rlnL+ZgxuRwQ%#8;O;@6&rqZP
zayjH<d{#OzW`-KeIjY2H<j-b0%NR4Ex6_Pb2A#IM2_vbldhSU`E@3@d6svc=9&ixJ
zITSs&D$Wii^@g&G7Iz~V=ooAuy~n|N1XJ)~K)X0I6%qZ4g(V}u^Dk?3Z4;Hp2x0HX
zoSN5??7N?9ZPDCm0!hW(M<1j-LUQ$$sc-^MsE8IUCQ8`$nSSBz(&Zm~&_rh|cNoOZ
zB|n(}B!#KQVjqF@n6KwXtOkYThN+(#!x+Y-pE&8^Xy95@##xpUZ1WkP{jH`n?k1k=
zuiT$5c2%C7kPUW)hTF?0*vn@S<-Nvce_5Ph(Ju@BdJVh*Q$_d-%6;)APe(KyJt#$z
zb2P#-@Ml;RvjPBOI~Rz|A4UHgh&?*xv=d_UBxaToT7*(9W#BdnW@BILg*tioMmCNb
zm5e!${&o2{uKogQB;GyMF3y*C`8jPXoOmF%tU!f=Om<W$>R`H#&moTF13kfl;e!Y8
z6Uw$!l9haFIiT-Lp!9ojCDWL{uLeFE5=erX%>MK~@0pn#B2lC8uS|1nJSJP3OchOw
z;6-Os9cwZj&UBXiRI6OJu;}AQt{ExLC#zT``*c(3y9T|sIbN_tggwc`@w%#OAb<UH
z=7#=>x0g@74RL0>GLKLFeBEcaUG<$*4Vz>3C&93rs!h^|Q%k(Z$A%jJbZ+_mHCF+_
z*U%ZOv?2qI@EZpN#NEis_`%|1*f(F(7dd+S`~<2QG7UnC(2|vmTggv+=;?Och$JtS
zM4GcRxl~VcL<^a~!Mp~}*y)$4Sn6>Na_Q5GW`3FGN-WVuOzM8+#9%MVKC6swg+YZD
zmo&oAG@{1yA{URu>XGD6gr~XFYTTc$4#1)ckl4T)W$ufA=OG;F<9Jo}-?2zmr``%@
z>of!5462pq;A>~ftIy!S>TflVit5u^K2bpd)J}B4@b(5w4oAzEXC!!?QyT2Q{67rG
z<C-N?)e?Ldy`wJqyr07K{}E>dIWfs*|JEW7Htjsm)pgp`?SrtLw@F}%X%n99dhy9J
zRt8fB6UHdE2=KDTS`&||Q2Kj5=;m078yeob4{y_BuOBY^wsv;G3;0)DYa{J4({5*^
zF%4XabEyT*O3whD)lBV_KOdWty0-w2`~mUo)NmDV!Ia+y1iU%>?`cQhK0@nKWB@R~
zOE^TE($ZXD{orBz@U+%oEW`Q{V?u)FT>c<mq67s?jF^jaJ(Z2&McT9z<!1UkM5uT{
zqS@MQ^Qkcgmf#}?h~;+AWap|ETYLmgZuy+fLGS;F=E+8NO>7dZEb`Q+;^%XMkIrne
zmwH>5GE`?8C3N`g0c8qYt?^pEA^C7&l2Bl(Mq=>gMZ6sbmOJb9)aHjIec%4RD=&^~
zC~rw(x%fu`?bdYRjEI6L?+q*wxWbsW@F9_zSDM%P3#4bs#j+sS%`U}5_P-YTHz)FV
zW7p^DK4(Jj#TdsD1~^$-?5*O1RDRBfUA78r0m2Km_n_O~97&s>nG0IuIaX%E%`dxb
zEmmH7Yb!mtvX=vVZxPRXwUf){I?lkz*B3@ZmWtQC>but|Li+Hz@^(l<nO=03I*V0>
zRC1b)9@ztQ^N_GNDImz1W~n-xBM`&8%*I!|F!+t)6#noJnAq0l$H^n)mqLz><RGhf
zd!};xm2VGukTq_S8~JtC@m(z}6f^Pmp)^N>(3GXXpW24nd*oP4XW-R5mW=1MqltYV
z<NL&0(@!BS6cJmooJ0J5@jmj>!^f%lQTpd6=@$?(B?Rjxl(FRZnfFuVVD8a->my%4
zx)Yr&74)#Cy^t?{{=XlIaxMQjhxz*!zn0`#&*Z2GRb$x@Q!uXcHvXnpo!;WqqstD-
z%P+T5@Tb~fS)mjPS|bp~Xi#<=PuiCV<g4i`eL!<4a?>V+OW!x<`O(zP)0$y+J~~wR
z)Z@|!X|Yr5PHg#Zyr=x7ITwruCs5p1^-#9=^I+`0a3JkDWC#77$j_M}*f8P9W%jKj
zviV&iAi0IWE8wd!u?x^vJE4?3YUvYRAmB@I?!DAG^`X#N%T{qL;-mY+&MjbHC!bng
zi**z6r4tPBwjFmAm!k$AW1$^M4{5R8!WaG6(AP34*cb9&%Oem!@QcfdS4(W&)-}DP
zmxTlS%)B-;@D--^+3XTmOoL_Sp+rh2Y7NC0GN-+JIF|#O%d6!U4jMsXFTcyKT*fp>
zK_dCMYXrq+GFgx?U?Yc)gIrCNhx`PRi3B>{w=NKKCB4nVzWZG#@Fj}(S%f-1dR3Ha
zEPyE}=VSXdL&HzPl4k^6V}+>SY~kl_WSe^H$;gnqni-5BfBtX^N(D`wxiEAkpu^T9
zZ0s&Mp_Ru<{|@x!HsXP`yAA9O+{*9ZZ^p|jweVh{jeLR&kCjM#WG<H8;)!qcFf>+T
zqMnqe+?#lc((i=q<j`!wBpOLclJirrfSK9wwV&SC&YW?C>w8$qGh$0;5#=u$rkPZT
zI=;5`9Vx8<eAc~YGJHI)qm_<XBK?Vf@6Bf_XbPTOZBrzr@kMWyf8oqU<Nm1y@;dj+
zVcxALp`vEA!*>iBwt{f>&6>D7XI9x|=OF&$>%{j?e>T3vzHri|Ua%kmTTBA91pK`k
zuEItAK0m^*50+K6()}}xQtA0x{})|MwQpPaIfY0$e_;`Ox^r?Za*pp!0~L~NnI!$m
z$E2}mWW19Li2uI(_NrP#54AZU9ZI6afQ{n4QE4sm*ZG9czHvC%i%u-tGPiQ7{wCdW
zPk-#uPmpuncNr7|jv$(bhUZ|6&@PT|(In6oTY5ghhG375b9A0$DHu=|9jcrQ?P$H>
z6b!nhh>q1oC=v^*RFbr{+@pMEql1551LA~wp%|j$U_<fx{-E0HX)6&wNQv@Fd_F>2
zIqBL6B<VBpUc7o@&aKXR@AD>p?WW8QXKi(#JczQAB6Xl*Nn#daUxAmhOH#n{?R@o`
z>xaRc5zMlhF?0AeNixbq4w?}VtF>wQ<=FP#@PXfe^2BR8z4qr?xR$D0uAym!((&+Q
zAvvtUhwzK-N*QunK{*cQT*U*LU%Q>9e;c9ioj{`(9sTJk{>j8Z!(j8u@PnPbZ7-EL
zaEt&0ZW?N59m<n)P&d;}p1yUuMeD8eQ*z(<JxE}zs%FJ<eT|@DfH%}G6Q5D0efe>D
z><wH^7)8xF;JDTG4m0ght7oRa4=fQ{#4$$4JHj6~kS*||-L*08=F|U?@r2!ZEXfgW
z&J0(3pMkdyWAA7sFx=d8nFVE)^&(u<O>Bp!oUznfD$2=@TXHM@xMa+9J?&Qq*}qYZ
zF#S+sW4K{r^hg=Y_<yc$I*JI!=(}y6D|3rr`^3tp#GRM7b1+~iAszs+KnRRZHzefG
zjx67Reuo(C<LRSgVNQRPs!2OZFJ@$63Sa9+XOt7cF=g-%*WtC#jOSv5d-DERZc33*
zr^DStS<h@^^fi_8d2WxWj>q(I!00&lmV;}W?T#sCfbOA~+3`6w=TJIAq%R0<c40<O
zvDOPJX_5Ycr^OIYYevN3dhK@eI&6Q<)8vFbMG6pzo^g42u+)x?X7^M09U+Ku!szcf
zob1F<r7ZYzTT8omF>gjf4^}r1lVPC(q)*t6M4k>qU?T?-6CPR;2JN>$(JFVgK5npS
zQMSQbn;)taZdmT_lVB*);pQI3Gmc#)=C;0Cuq?vY_zf59Y4UR}eXFkOwyp#3L1`3i
zbiy(N`pE4`QC|3o?JhjmH~L^nU_ZT5bKw&O_9(WpkJu@D2f}ybqqgH{oPdMeSw`6x
zd~-^UtVGVXzX~v(&>y_L=b(eSkCdy{5EF6LB~<21*ND!-7|+07ulMD#A<Km3uc)Hq
z%C-!(aoKA|#WXRcbD7sSgq&ppAepGJ?<Js<`hyTl)TKxtb3RuBsEvnv?Mm^meLW6F
z0{7aKqF3lz`N#UNVz#p#D6u7gqIGz&e`>w{l3vp5`SU%n``Rs4bUY0H;XBP4ReKW9
z3(O%H)Jy3Hr^Vcx76z!ca8A@*PVNM3m~CE%pv>3g5Vuu-ry_p#Aqpr2rr}JXg!TPs
zBk+7yr%JtE&F?b>3?U?q5&;e8WAoHH_<g3Bd3#g538HYkUq^J&E4=faH?eVg>l@1P
zGeiPi+i)Z34W{UK9Fn*z;bbBv==R^v#`Nzl4kMg>61I55eAQR3tTjJo2jB1@t?BTk
zb`L=&-4YEyP{bdJ;iFMs&q4Wg>#cM8miuqMt{i-u@vG+nC7RO|J+n=`_co_Cn*SPq
zDUY>3@DDr|p?<L)&P#N@V0JG@|D(y9vk9;B{wz%OiVq>)Yv*KIBTb$G?>JzF`tcIW
zL3C)LiRd56kppwzG#~mcj2&vsLCb%aXdPp{?V<zzf``^PjQI#}HBON!U35E;3``hV
zG_ud<Y4ij9weUGtP<@_tvmMe8s*eJ44w!}D?W0+fxDp$d>%xa+;USH%S7>RvtzOQe
z6PVdK`WyT2?6Juxa=-bk=_nFho$t+eWQZ3}bkfKMySmdSyp{P76M_vx-OL*%jiVC%
z8}hX!#Io+FmN4e>4aqT^@lY^l{Q%#el1+Tm`DI0n3F^1Gx(Xw@&kTXze~G5N#n#Cx
zu?vSrh!T8}%4i+8-ruQPchg<kvkay36W(V)(j1T9cEX&99EJi>9HoQx&rp(6_(Asc
zq1Gz_w4A_PdQdOd_}L26TTr3^f~~<~0}sfE{h@rw_+9w%j~D?2vxDI_9%@yfkBm>$
zBLFAE;0)fz)u6dcm5w8huS4QLjYcD^F5v42>6htW|J0or>m#gA){}SK`+TnmO9xyO
zB(WEpTs0R4|C-p&qf4LS1ZH3|yV``k5?I^Bd<0dg{X@Ko(nM~XA_Mu64dnecL=s))
zwFS)q{pkzl#O7XtOuk=qY9YQ2CevI%s;&iY94*fgmY*fie^1kw4iId{9nlgboRER<
z7uz5-A|A&p!~P?mz!(iu2b0y{iTEbciA(EU(6$EEpv_ZQnSt*9wD$SD9R0sRT41#4
zmF$7PFmlJB_ahv*u*KZ;Rji*}Mc-LDzT?dW*<1K0w{OP-VpdC?^&C53hr#vK{X*4n
z`$l0QxTA@bvbLmOV(EHGAjg1h5ZZ4bi?FD7Oh95wp~1KsLV46Vy+=B+-aoAphRFWR
zB<N)J(JbIbVu0$E&5$`mQRj#y$bO`SBIwLp`v%_h7S$Lxy{Uy>E+=<KaEc4@DYiWL
zb4Y@bpnpZZeQFoa;tK9E$3+AYV3S*fZGyM5l9(=s<GO@lhBf#fIt9m2nF47)4_jGp
z8Bh1W$WyCdAv8_UG}+QeIF;nRmLN{7UbK{qZ2$7xGTx3)eU`US;+unkUUtZWWyhl+
zdHR^Xo&ua<6gc3<_e*JDweXhL==8Q1*CxQ9w77+Ee9a=jNp>4cwu86g0+IDdA{msG
zrXvE;Rc6~+xRp${^9K$NP(0lz_*s`@thOt!DQ=IiFQ(%RA$A5f#DEgmz6!VsV^!jr
z_QL#9abiDl$^M3yzwO>kbTW7fI3^?3f^F;*UpG2R@p=g_(ERe_g!al(a6w%^2a+xE
z=Og!cZc+orsqNx~STR44+^gUIY^}F_TQ{GIu86@tm?$P#G3}}#b~>gN81t!)3DmPS
zzHXR4^k_%BTwfDYA7ICyk(QEYEjhXGe{uV%g{H}I{F9)4gf&*SyWUNoyQPewZV>V5
zAic;lTAjbW)?Err!x6v8ys$P^R}Nq9TBwf@$J3Z|tbdF1@Qhl&d5BHLkD|rui)1s9
zK6N1&I~<ME`~=pOC5Bhbc@2jkK*PoWqWT0=+asShZ>GYoYdPafZGv`vt_Dh}R<=lx
zdo3a`FE;Uk21KYMPj}E`EZJA;+Wrzg51|M)QRPnv9ASC$hK8qs!9GKTZe`Fl54x+t
z{*8UxmMO*VCz~kHLwxfiyb0apXOp_c+vC3LkZN%99K<?m>S|>F6wqk^&KJ#RwQm1Y
zfwL3DAKXgl<8H?)e<V**qF}Ky3X=80Deg->9cO3f5Z|9O(mlj=5i*b%d<7)J-}<uT
z6;0>mksbPb7M{q;L~gu1@*;T$w4>UYjLJTXe<!cY+LiIiJ@#);n`d%CCS71d7`H&=
zeo<_gRo3lVmlbaRKPBnh2YU@wJZ(SF$y}PCO7O2M5Qk;(w7tLK>Ma{Z?O*H-$?5b9
zy@F+|5FN*@xbYZ~r}%$(<q0cwq}HR7vkI?g{{YhG4?lVb|4P1r&0y>5+??9TK91Y+
zP5+1ko2`X0jlwy(de`_PGt*RNcPnV-9W9kc`l$S8z+VE^bw`awoh=6gm1nOT`XG(@
zwI9=(9z53NdQqBzUodL*=G>1(?p2{c^`6@eNfMv-X}R*s@(%DhSE}p>Sgwo>LQlzN
zhk_N=2cjq|^d1S68?VcigGa)J{(^07m#^DLj)yRQ9y_(4AsqU(QE?d2oU<eOU-^+4
zxEWX;VJq*}Nb*5*DmZY4$E8!bYBv&H6+r&w;X-G_Ih^Qv9kjTh%PExOu=u7wBSFwr
zOG)oKeU@vnKnly$=Lj;jly-7%qXuo^CU$jo!^)AGTzcTf5(oUs^>Rbe(l_Mc-On?J
z@cJtZpNa&ZV+$!NKaih7P_#N=eatD=`R4ViC>%qCk(fXI^u^@}|K>=}2O|N&^X^Yq
z$H~mYFqO9>sAXB)H7*7tXNbe6U_xbjuJ!Jydm5sJn-@6u&TNI8Q<^#aD2f5CMC!n$
z3;6-6$3^6Y$3Oc=IN+2tU^vhtF`fkY_g9CMmo57PwjJh&KU}AB`Tjzc=gn~v1-5@4
z8Ht|WGa0%P2I|;{{5=IF<0S7Fb{uWfrIZfiZNkbT9auR6Rk3ft?Y6{GS+g`}AkIy}
zc^KQA;|=cOz%MVt{`fB_4li@0DC&(Bn03UtDEILYsvK#^hhTplq#KRk1SM{{AN<;>
z1@+?p`nqLdMyR)2S(&mA{4jWJu$Su%_CEffXTd$4HV-4xSM*#z@ik#`SmQaE{|t}z
zO0g*375AwYviEm7A%uo&UGlUQOpAL(*AHsD92c(yrva8>dMJfXkJZqcy3Z(&XS^21
zIh6<I=CuK%KborrD_wTGi{K(-kmc-JCv~ADgMU)AJdD|nQ%^4$oz#_#0!Z9%WwX#c
zXld8FUGwTR%<#w@%pylgjOKcUzsLa@@YZIs#IWn#$Inx`=U`v&p`inTS&*xeujiU;
zkfiW)gGfkh)~VIIbPz7|CiSPWP|?cH+w)AMTt~FDMX&*KZ?C^3^YHV}V)JSBcCSxS
zkFY+ePPXX4?Z4U*4*1mbvi-95Xcw^e%E#|{rccONYo3sYw+Uk_AU@f!A13eic=yXJ
z^QxQZClxSUwu3b&P&55%s-hUkBHAryr#KJ8F06k}e%@BT8uAU7JhWR$xg%hj)+XFi
zBB+frIXAP67Q2dkJM|f_c`jYWdXTy0et@mg<@+v4whS*4P7@j_=v9_|a{r{V9ysn#
zV)5ctV}A14u|vsj<A9W1;*qeFtHq<WIjbIW%Ste*_)t){Fkr0FIJ#|)Q#o43iM$Ca
z!ypvs6$D#yK)f%B;p?VYe(|b7pCIH()rrS{*n?&_ZE*W<df@;c=r0mv?QN~IHm0nN
zXEF0|Vx6g$CDyf@bb0F?^+P4Iva5*6xb763z76EXwsM;r<P*)<G<^d!D7&o4v+0P7
z7X;;LfSS+3Of#eZ=uDL8<CX0!-mk|gHW|IR+GJ-Cn(nb_`J!uQnm?Oa!|S^)Sv^|6
ze_qzRF_V#K_~GJi<#w17Ijn>V`VDpMeB(FBV;5ilY<{`%xL?-ufx_O`p$B*CO*QaO
zpLl7_O#M8U7TEdzl_FxfcS05<2r@i?nb)b+)YL(nX>!Q(;8}V8dWO0t)kaYdm`06z
z!+tK|XZW>_9Srd81tEHX(${4pZmv4JVMqx|aMSOY&zCf`qFcs8b!AD#2)>PY29}uE
z$=Z~>Z=+HR3FH1WCwc(4Cv_2I4&xQ;s>Vs4EyUMw?hW%|2Ses_hd-^@)W!d_@Ifec
z)#PFutey60emEW%rnnKK?@y^3|Gnj<_LNG{QGd+!p0fujrN8dAV|<tzsS}-px@Ods
zIm@%1PxY1fUXDJ==e0HnFd>$teTy}c^m>a_be|aNaw*oAL?tUB7$h<16Ds)Nt_uFd
zn}<>dU4|K#gU;}(8}Ujl4S8D}LWsL#zU<~lpfl`ysq$LSpwdL9!|I|5yh1lOZC@Q7
zY7Pyt_`8oD_6%utkn=fXygcXi$&MnpAvvE<(D!FM8T9F{6t8k2pZ5CVRkN;rjt86!
zm=}-`1N`8*yXuC!cq+T(<lZm`q4ovj1*=2kb6h^0QF#WfovrIx<au~2aO9_fSMsOv
z9gUG`->sKy7ILoLn+2O8VF~(f0Mm32eGdJ-Ify9sJJGT;fz-bT_8qYS*-<GkO8bdI
zwiq{_*M`5v_Q%v*k(Z3)BmCKXzv$OJpI^}Lc}uo8TZX(2?3TFEKLS@X4m~0)-rF+f
zV1zeuzOla_S}xvZ?qPwbrCnIQa~uIUF8T1AH${~AI#u}|-T2OE@MiKUT)O8p@ZHO+
zj?zR{kibfYLN^&~po<h5v|iUuXf_kS_m_Dl#tXq0N#r;HtO%)S8sbyPnHG5Yob<gv
zSAUEIymX3dve6iu#a`Kwikyb}peXw1SZh@Oebn(072=7OZqf|uCM+AezRREIsH`SU
z4gou8TPmh`z%>*A8_4(92P~O`CuB%oV8=?5x@+K!ktDDorlr~vo|K~iK~hApU6PkS
z1LvR|;=Zrf)lDWl{`y^t5*>;B_MR=ubWN#Bbb{K>L~ZFpFDTk@$fwcqoM7%j>DQwW
zBQxdAkK@LazJr#C59|f6upe{q-$lV+B4RPHc!|3_&BRpxT4!J>|4;dj%OE3n23(1j
zk2(k0MYH;AzQ0!6G_w~f_X+yv#`GI}?!XD{@^`e?k?CNO7%L06Ufs2;xZ~^qU7rz!
zB`f1-T%VLX-!|yrv<yyQ*JY@a9H5h>5)s6Sr$a~aaosVqnVpTIW2HWKyH7oEHcgAu
zZCaM*pA6hC-2wQXV<Cb55tn_J9;(!M!!u)mLOJ6O5fcS|h>rz&dx6*!zw`dk-m|#v
z&gFpBeXfKRs^;<V+p?|xei<{uAQN?0n~CMWMiUK#$){UQ4>D$tgDXkcS9Wi|5^lyc
zZ%reBc9y`3rbe3gYl?wi(zFh2VyCy1Q0f#jE!8*Q)=nrVpES>b$hx+?{&4wQK24rm
z$~4&f^Y7z)88M2C0Ku1ti2Ao!)Ri^(OCDP`(1jq1B*;(bD3pAN<o-8OulQKs-}qX9
zNXqM`n8>wy22d(Yx-bF2lAjF41%<sO5i0DJl@qykjQft@h@>eSjhcirTfl!?Jo4$S
z7oHK_h-==8K-tlH=r^8cW*)3jBgxORQDp#Bk!nR%(_fM>l3IqFQMj;j3}~p!Utgs%
z(eyesA+M9WpJNZtvk{k!5L3<}lv}I`)fP8FVhu9Nju0{!Z9t3YgSjFJp}Op%Xjx&T
zV=Ng$07bVEuU*W#6z|olo5<0lsE*4fx&&@4671K-g<3l;SZb=};(fY>&>%BZ@oQoO
z$BA^lhA})}Hf(yr4oJ-DbJSh*5Z|V8y8UA#i@;E5m|<MdI1ir(z(k-(eMJc0-CL>G
z7%i(mJdHw8_s6dX;Ij#i^`dP9VjMnRfN4+e;rm71ojy9Vh!6EWV6FYz(;{n^ECG)|
zJ$q!J&Xu>LHf5{l@*7`4P>^?e{lTL7GZ{_!=T;B1r=6D0Y8)1Dapjolts=$T-Jtb2
zZ}8a2Y2I`y7lBm)EUaU-<4JNoKgq^S_aZHgdv%oK08tO5P-j|Jv@EhF(uy3)Dis48
z?D8jLT&!X&-ZRrHuYZMh{+ViUGLm13&q`qYQ$__(J>U9VdISdt!2F7P7awWPcOJMy
zT48PE%W#57@-iX(OkVf0jBb6S&m*Nbo7F^6!_bE`yYI9fW&=x@3Qp?xB66Ol!Y?U9
z0dgd2iMuD&t*^WlmA(i%5m{`jf}4Dn%pOeVg%Ktz9yd!*;fWu@hE)FLr13;*Vvm3w
zQCz(=QUXX6X@3mE<;u>CMe)mLdmNtM9dhycxqTSB6Gwv&LpAc$FS3=JP!A~y;dc^`
z9-|%|+Rkk?j<o|(z8gG6f9|QAnQE8c5+>H3<W=?#-s>~GGEU*Scvr2*{!EQxI^ehz
zxmUBgR=Sb0@1d|j0OGw#-`>$g8Ia1`Kv_+P?V6SyTp>Oj;zRC;+`;Q{;Xz$U;Lz14
zqYV5EtXgCFa#GNFu00C6o~uH}jdvU*3IhR!PSch^W-6Y9chS~6g@^=ZGWG#|!e~Z<
z?fyKCHa*1m_k$A4^q&zdlWb+qtdirF=l?9j*-%O-i@La8$v!s$$M)^{e->YzFA#;1
z$q^dU%nmE<dDW$ganu+h8-FewAvl)oncT1Anu!{ws)qCE7lcyX)G%lnT%`N6w|S(-
z`j?cVIfq0X*_{ZejnUtT@;(qY|7?K$V7SSIr;0p86w@tw@uwvTw$kSRwcK0lPVB;o
zqwcr7>JTIGP01G%`SQ;k1tE4EaD=nDI_{18G^9{=v;sLDWV*>=P)E2IFzom6Ma`?0
zIzDCkB<f}nMDE7k`dFtn8{5^bv}^UV-0DXl5H!SJcYjt^?vXTe%9%AUn$4dEM+~mT
zIMSCN9$pqm(Y)mfF2T!fk)do+6ux8V_OT&4wpMlBxdB={6GL<~FoeNdN8oLJ<gU;3
z$9wFvV5a{3Tkua&9S2p==bF-#csQ8Cu|FAHy%Ody;q{b6gP5|O{39TeniG2Gi*11T
z>J6lj^lZr+Fc*EtbK-Df$}c+Idn28KFg5kO6uP;6418sp;B}Y1n#C8!@>xK^(YtQ_
zGy!)ZE8o&$61+lH*+zkdiQwN3Q9M-FwrAzgTcpK~pZ|a)DaCDmc|d%@CiU<;15P3Z
z#{~~e1c{jyBOa0d!e?UIx1|4o??g8_L2)ja)0_UT@ZQ_fnd|<NOON|r%rRYjF`rdV
zSvKyYVNBt&KF2R>9KRiGs~JHNy~HpmDDHGCX+}VV21G?PlS#|=65ig9C+n_nrs+;8
z{cnF%g+3;o7)}~V2_|wJ$9k{!y-67GVJQc4ypewplbrX%Q4*P6>d?TAek$vcfdX^k
zsBuOodpUJY>oG~hq3hFBO#@;m^??u%u5fpXmql}5n-mGJ*r6JLQ{lxV1yycB!}pp=
ziz|8CdwvsW8ShmhLFq7P#0ddE8e(?&T6-VX`E&ZRAz|S5oInS8CuDnilg{^ThM?Vi
zO3?RLg|Rf^K8`y2`LUe2s&Au39>h!t@f4SRX)sz_Pob<4RK@+HN%U7N$}s_RCvM{<
zdzhu$^NzEJ1IIqZboBts4FXeMk1bIWJnRhG*W%#g1(1_%d-a^sMspL*EZ7(=!J6+x
zVj-+13$i;OT!X1#<ml{c<PUOKd&=@vaRsAHPTZd+01Ak@GkPg-iv5dSv6>*!oirnV
z%j>SdboN2E!Q~WNjze)V+*p&S8yhrJM`!8FvKnVp>ZfUitg=(M-xk}JyIP2wnKG>z
zAxP)^;=|JE*PbJpKeEqdxwj>@EFpOSB*lj5_J945?;s(Ho=U<S(~O0raew`O-T9C2
zJy*j5$%Rd;OZw7+C7oD&>n_}vrg}bLH0>$+0L$>C+?9gZ`bzKs^})Saa-$mlAN%*&
z9mK3Gj`rg`KH7^W{u}hH*E>l<KS`!>h+|tqTOEi^9b3M<;vlgh#{1QKD!9}d{G?=M
zjeP?=8RB55PDVCE4Rcd#CN_{0Q4RH@r@b$SAK*`g&p(MMeLRB|1%!1ixJa^LjL&XW
zZp15o^xT}t6eQLA8wZE~u35mmbu4TXQu5*L%gboz>m~_#k<H<7;m_eKa=Ec|eZ4pW
z>$-$chnPPiC|I1#R7e#GMlKMSi}>qSQR$wS6@gE_-0)W}Axxw>s#}Sdw`!!?hw`fl
z$D(AY+YR1P6mu1Jc5l2WUQQdw`z&${237REf4uX+VxdZE4z|$u)-3EoL8-a20$Nu$
zDVBekoiB18XdBcl6d0w9s2j+6vLn3*{xTeNwDDd=pPR%`Mh|8YZ^?KLCO`Rt*yy}u
z%h7(=F=}U-)K?9WyX=y7{hnJ);CX;kdY6@a9--$WWkzaCl&habka`Lce>`vi`P}T8
zn=fWyN$2LIeGcZ^Z%(Z*4XdV#B~tNVY|^uGo1dH*e8)cW{6~_Q3;#dAgXF4yS$gOr
z#!R&5Z%5zNyqO}vzD192Qr=2{_BTx}j)F&>mpFa@$*InlMg{#I7eSHNyt~95i-M|L
zUsW47VG27OfsIyzPQP6S%e@}tW)k#N{Em8IDaYZl4>3b;v2hbgA4WMyHOm3K%kKm&
zh0%sJl`McfRv42Zfb8AUcXU8bmg8eL8UiQn#`pTd9K^^(diwMDENZRnqZ!dZdDcGb
zCQON8lEAt#AN44anOoKk^Mi+x)3L9)vf@~*k7%(n1@R1*tHbD!C$AoL61XQV3C&AZ
zgDxGCuVZ3;i8+BR(4)9si{I0H-Odpp8&@H<!Lnb>_O0TV6$Cw^09)~qkA>vM_Asuc
zXw@bB*EP4bn!y#S|Dfg8KJ->^u|KM%zD-+>O`dvsVj20sd^dXfQk=IYSJAAVZy}98
zIh(*MT5{0R9?z#Zrl0toa!}M9u$a1VJ)g<6+Ad<)qTO_!+g#r!s~jZ99IS4Hdh-(k
zb^tqMoG1L8^kEBpx!2I}(@tJ?0I&Z{iYCF4WAT|+98|%F-i93WIt4G{diu>zQ-_DP
z#46NmDM?3A&0j*KER^njtTT$C@H^)_%BJ_RldUsLA0kFS+CTRCmKYjto!LC19HpRo
zLLm-iV3}>u`3A`EC<nH&5d~H%e=Jzu%K1m1I0?<-F@76vX4tOvB!?WZ=L&E$lhBRp
zII>W@H&W|RnmUujvhP*<>@a&`&wKr8!at&*_vI~+nnj2E_;n$QQW@m?+@mpo7KP{Y
zyeoPyAKp)WG8W&zu1Zkp)KqKoqMNNYI~<C<rQh+!XB010&5#pCYtmY|QDlht<m+YF
z><o!@WjJ;|+N!ty+Q@t)>$xALN5IO0RRE5ZiX<CzOMffz$<_CJx~W(JwcOTx@^8hv
zA)LW9NAXaHf}M2x-S&vOwCBA-DlcyfeILb}2GnDTn^f<5-G}>hdD~(g_Tc6*0pUkr
zV4zdLr{8oQ5q!vyXR3>FpDr1svh)8U9znjDwUj=#GDSt|ML~jycqYP0zR7JLUEQy7
zG)=9|n4qsQ+KO3n*5jKyn1N{*o+K5Nq!(sB6u5afRP#HHXSMo%YLNm*;@V8+<Kj7^
zo`f=i3W(*a?4h7TuXgsVz8YC;2{s4d*<KN=1kc4{2mSrsU(>x9xuT*;masr$Ifd{_
zJb${vjc0P9k(Yg^gBV0g(|Cb8aJ0AeU2s^Zi~Lc~=jVs~YV%k9ZrLY&?Ond76I8z_
z<`a^Pu0J(1$B-!T2}`s1uG~_J{i@8883IWZ7?2>9(7e2*yME$2b(P#`&4~8_E2or?
zGhR$<8%Ma+Hp&9Uy|M<v!%XhZf@txz&a1oQOJgU+dzAqyQK}S^K*d1Wbs4VrFj{0X
z!BuD&#G<Cw=b7$m2++ybR)6n#^se{(>pta~F6VJ;LE4tAa0XUntxu<`?a_uvmHQv>
zl{kJO8cl@%L$^tQtT)5$JMsXQ8HL5yQi6jq=I6$-@-l%ML`ihxx)>Uyn8B$%?Mnss
z%L_CA93esL7ug00VC%v>!T1T!0#wcgcq&0)BV+JIH-COZh2@lb+o+Q|wU!kmNvxEu
zrFT2Z8p*|jDa>q4u_4(_ll8%Dw)>c{67KiqaC>m>7e_G+a!%LjLuRX)XwR~&=Gv;7
zOoS_6b8$Jni9>jsP>Z^b3L*1@L>i`680#zTlMpT3QKK(wFz)uu-@eHH)Twe^qo4Yy
z392C%eCKeF<X)0d{yHPi{mF5DJ|}-!{=_F^A=JMD;8WQBJ+`aSj(tT|c{}1PU>eX+
ziS2vtzA@nwiED|XPnvW^R}YaXHw91V>|5J*IvK9xDY9{S-|l+Rw7e;6h9IpyqrnMU
ztXZ&;f0Lb~ritccJDC_FZ5CE0JiT}M)!n4#-kT5`j9<t8gdW@b0C)Sz7h@j|()e=?
zNU+kuMqh2yzw|F$Y8PfzNYCv>13c{mVB)tpU=^QrN}eQSar?<8Oltql_k-M$q>p(V
zNnSyZT{h1pDm?7R`(=^1qEAywId~zu{*CflXKE{FuEhtGMFU{Jh(OwjU&reDL!LV4
z5T`8({-4p>;??4eRBX1z{OP3-cIJeI$u=EW**4yLiG3`^iGt`kz9ba?6j$rh8scjd
z85ud>mtOSl@w_v1eTipPPId2rk^%b+Hn0-XRlZ6S_zb?sh+_ElzDAx2c85SA%D5^m
zPkS{<$5fV8G?{w1z3#bLiHHHCa;oX?7z53=*%=Q+o~tjR-bL%}hZT7?%KMj_NyK0L
z45ocpG`YwuOuuM?4jXR@mY=`T80?`SnnQq|1O+~gY{bkI&A5oSUINYFf)DA7iW$hC
zHXnppZqEP=a?`znX=LRx?<GIsH1R8Vvgk9bLSimKrWI5_@xkVWZEzr_q}gr=wMIV*
z-0vC#FrO9izES?h?dO(a?F`J8z5L*Wl#3E_N;%6B@t@a%2~n}#&+D$O)phX$c93eJ
z;tG;at|boHNDbBJ3lFr*2BGaUKI$)i|NIRY;T`nyUh(Bi-a;K#l+k&P6r2{W-rjaH
zcF2F~;STf7h24Mi)@?%9VW53G=w7+>yQo16=cyMyX<?av8u)I?#IHF>0Sv!>+bFcM
zcK+(d=vGy;-8Uu+3lonW`5o(Fd)ea9#2IF+D__nH3Tq$vz{WwtJ_PjDdsP#MvrcR_
z9A9yHzb04KUvcl%zFOC3FhRey^^IhHudlFKC4_H$CL_yr3B2?`;Wb0C$;xZuKu<Rp
z!uk<jKw6UGQc2!wWycs<6}_9CG}bL=vH63uN{vpBHIHL~^UKbr@QWPh4v}YW!ig2S
zO8b7#lHIpQf*Y~&p3F^vQf{s_XbnNf&{K!b1a&w9R}@tDGQF9NYT>x*FiACNFQNG*
zMlh?ik3E0n>mo;t@PDea<bl(nsIv+~g07&#;{M7ytIiaiq_X4P%Ks#m#_hhy6Y=7@
zjuW_=*HOl1C-wJe8??WzdzBGo-`u+wI6nKK<*t=|+UVcSe6&Am7d=kJ8Ah(ng8e=t
zll+82BnS*?W_NYDFI4bUm}=<Lvx>3zZc#n9n^{j+her9oyy!WVV^l7EATvwV`n>en
z?8C~djLh=i=7Bj|XL;G-g_#WW_3O>7+m`|LmOc&pR@rtgFsUa7;;FQ;GUNZG3qK-0
z=56-2qBU<)4cE1;A07T`D}pRJtX^||m?_NLD)Z7B2~}Qz(aM{*LJE|-4W}pBI2r@`
zerhw-{|#!)IrvZEJbw_w@L%XFRrW>wGj$Ia=AS7}=Tff=%-_70P{{S+l{W~s@7IUD
zl6<FGwRmp>s4X;6aZvb1!WH$WUuXQy<inqTPB$WdOx$qgRa@-lt&8cNPUO65Z^`wJ
zZ(CE~0zTs{eqk14u~Lt|$^tAoeD2$Fe&8DQ)z)zJ;BZ|xH;%&3v02EWE^;=;)JsQC
zIiN}bI`J>s?NR<VNKKm6h^O$+lVEyRmVq);>B8ypu|1pTY>?Fps-DL`^?+mV&uSCA
zjyu2BD*l%v=0Ew?`4K)ylb>uYHfN*?BmEaln<I}s!Hu;2GWjx$PeRHO=s6y-`<oVc
zz|((=$sNfK8)n@`@$Cp5y9kT3G7dZ%**3Uz7DgZGfsGxt7P9j3ub-byk{+T{a;MWY
z)vwh0Qhu%U%`SWA$3}hA^=(p&{h$)G^5M#Qn_?mb;hmLX(cgK`%C~^ycy(EqzuOJM
zMZ38CC2xa*dFy!RmnQyQCIOpO=sz9@I#B#_R2zl8m>XyM{_Eukw;5Wm`#m={tF>&~
zE+5_vWI}D^0>b!Mg(j+iRBxPuex5JAb8rfEe6P^L6=d0Ju?tgMnTg*pw4Q7SNjvSc
zDeo5XXQ1RNVU0Q0eG7c+M0cI>-{(nS7JruSizNvYYRwgX#~wTj4-t!O78|qV*Iyh+
zp<5@FRulc`5k-nPUh1!ppk8iC0mqXvE<utYoDfA3(O(<Re{p*6f52h_c;8!U=*Y^K
zCcKzYK|I|I1M+uKuPZ++x0H>tzR^_9XW@~{!S~DfC@DYOukY@w_{)paeKXZnAF+@p
z5@_9q!$j}<P4`1w0X76_?|hE=a3PEJh1M}i2X>P@1=2(i<?`?Kx}6hrA4}PphOP2_
zl?Ou<i9c~#xc_Q<8}yfWO~-g>G#q`V9!5an;#9Gylxj_{@@aWC*5GqP%^wzv&kj1x
zzRZnwX`UWYgybskGllXFS$3(>!B)SaHe#f%FB)u7v~$ZN6VBC9QtSAIwp=Q+ce`(Z
zB3PBPDl^%rpUk8{QJJi_U&>0~-m$xW{fKQ{{#7Hu%KJsoyg<8mX5`PmB5@<J6}+$a
z*L->g5|c?X7g(b4(1K*^vKds8&~wN2r`LZmJN;S^+bQ5ns%EVZQEl8rj#P5xX^&gE
z7bk-#5JE^hf~d?3sc`MVh8bTBEq?k5m*fW>d6AKXzlkJI?pM>D!-|JNU5$o4y1+lO
zudHEH^Pi-_^TicI$*I+z0{f>2F1@aF2gl_lPY;v;q{IpL<i<NbK0}!^KPV-^GTdBm
z=jv+4()MhEvPNPuVq!|=^I7(nWG4PSI;7nKxH1NX(<((fW(Om;)sSSnF>b%E-kmQt
z4m1kn{_Dk*6dE&bf%)4DJpjE7WKDx&!_v*d3&h-dI}smd{YIp&tXH|o{W@#s-zXl4
zq|2<-_}O+#nGN+FZy<wo=B#&!236k%UF4X+md7J%0)Ldcc^wZuQ+U7qY;X3Ujx>yl
zx4=y4xnkDMW$P@Z3q2y7kxcn#XtCM9>Z_n;mTMNX7>5ZRMTHSS=wZKv8^<rnul=9M
zqhEfYCz5XX=w4WY;FSlW)1O=hl&=VMy)Cko8H7_YlnW4T2)Uy)(3K*M0<hbe<XGU7
zwF^-m4SV%K=H1@N7?&vgE40eN!R!5)>ie<2#z$b!(r6Dq=Newq_XklU70_DrMV<f1
zxQBY+mmxJWgHNO}r5;$7IEvS2JZ?_w_tesnj|K7ufAgydvi*fU2hU@P#(To=bB^VE
z{ZJ%qAXI8OesTXu`TCiTv$65{{88Qx7iAeQ?|)nmXn#w_v3vCH6n<8dx;3;HO@ki1
zKueb}9Amin^2WGNq4H}XWIR%W3{a{^A6@-CWY<$qK!F&AT!Q^VzHw3acM9_Zkczvg
z5FL4Nyo0Oz8`3ozRamkWko}RiKegSqFL2sbEPpNG_NIE^(Yn8PYISeoU+yS1%c1sQ
z_#>6&WVXc3pveZe9RqdUc}bcq!c(u#U*E`db|Xlb7oXgF#p{+zk1iD(urld{`fA2S
z2&e)PTJzfJ6Oj=sHlr=OgBkPB;h)Y^qo~7aJ4(j(IlKyN1W=HwEjLvc8Pn&N0fIpd
z*0Li9%~MW}ZHc7livGu-L1ZjTNySv~6DxOPHiwP)_a)g~^Ccby%PIH>H_bSbp3kfo
z0USrqW2CFW)1>U-;vypY>V{AK8_Kv8?<>icv{-a<<_bqGlhOvg#9lvncsb$emy`XD
zw!CarZR7zyef%e_Bc!IM)qd#McY3?lg;W2Y<ItS??}+s)YsC@8MltW_&>;R;n*J--
z7UTb-_rS{mMnpr<SS$rPJsWepZeo8@qvI)5QrJJEx}7+fWX4$x{8C8tcWhlIew7?e
z)6V&$b$&0jsPHzD`Dk%I_RWdrcM2!4nI&I&HAH~*V~AVy58KL(1c7`CjwDRTs~O`C
z@6w#BcdpETr45HBy?%29@TMt5{Ahhz^QLB9b>heh(Crszx+0Y%wQE`q{Pf`-Yg30W
z5hw4TQ|}bif!0XdKTU`Iz63)O?$W5K3D(-&<NeUSKThz>kM-*^AX#>r=H!L#{(m>u
zWv{yzD$IFW0|eoGYfYr#rNg&fGIkS|3I?ZY+|#KB(+-LFbwY&0>Pr$pg7M1iSxPar
z+V>VQv~+wuVmW-btYkl}c_@c)Ena3N=C2gmd7n!-^C-&vW;K&=&ou}nv&UZzcBEZS
z!HKbD9VtDwkDXg&O5)NmS$Vsh-THSsjvnaX{7)BIww7?}pkVTEN{#ylH1}vIi(tLo
z%#U~af$E!f%iwn0`hiD-#`L-+@MYg?CE1kA;f5?s*;p(k+i{Av!e8YX$)T#TYrir6
z@E*x7$B(%0k3YyP$6Axq^tw7qdpnu*hmDU6B;z<!z}7unPsXxk40%;ksC1a&9IkwU
z7ojRe(80Zb<&c)q;My86U9-j*WQSC~hKQ>;_rstAgeoD5twuhpj0hNVR8SIBO?j3A
z&B`P)*Qe;NyHK1ywvTV+=Q~(Wyhf<%SXa{Cp87lswj}TFv#S(+t2nE)LPhh$u+}c|
zn%vCbzxPtofs=DlG+vKCAfE2+oUBjg!lnO%xLBwE@h~=+zuA0O9c@ke-`KX2uO<B#
z0g}ycE@f?xK@X{ZR{&6w*dcE3cb_@Tu>GiIv#krOz(;aXCBa&fx_Nrj3#Dt~T{;{C
z5utxc%j?bmSTI((F)ut0;KAphAY~`8_z;AJXk&rusifgeGLaH{hzFn({=-|1xHL+C
z`g#(5YEG^D?-1GzR1z%rD^1arv^R{uljzx>f}{Zr#hbe4?$c9M`$|}^7O^az7hDCW
z%$e6o+j|Xzctc9U$`-Sg+wWPBkQPoP*Xv$)Dx5zNyZuSeC!g-!bFgrkhkoj8(dFQ9
z8=;yJ{Xj_%2C|IZ(yTORFEH0%a$7urCCgYPiXD{c{sU6RO}EdTy3o7t<CV@pt$dp;
z^csIMze7$jG*}g(Ay<peJtCp?2!#^Oo7MYz?k*@=!JOji{>ymv>9-E3QYt53ie>3$
zPp8e(rVB9B(gZ2>L_|6na9hP^Naq!RH8s`?!R~vs!bh{t^R3t)EvB4oUiP#2UMSfq
zylCw(bLK`Zu9P}I3RrwbkjN1hZdlAyxyh(P*Atm;7)t5IttNo3H~e7rz6Bx~#{7HX
zNBp(w_?3ZfP1D+Jpv(p9P_9CvSSYaBd7Nf@g^6@?v0rg|c-L)-uFY}adpX`pw{&xv
zac{=9{jEI<+CG%TFN)E7MAG?ZO`#4ig)7=|P#?t79itL@C&aEXG-H=|UwSCoY1zUm
zs+<slFcna5AFr8jlT`V1+Pixm_?7p66rFWgQ*Q%?M@tI`NGmN!2+}bSl?Lf<0coUr
zbO_S%qq`(ULUM#km$V>b(mi1Gw)5@#bGvq(UDtNbd*1WD&;8tbrvuE7Od~;VV-zEK
zAnb#A-<IRg*cIAEU55Q=IfnjAPX!7t!2jg~nX8Ey6n;?SK>o)&iJv`OZfXariDi`?
zGm6I8kVSdm-2h;wwpmS(QdF$s8%NUTjn_=!;Y;l@-nEU}PdHZ47en8&_#MTGnRL>q
zU^=d<*?VxjKDv>)1++PY2re5ZE5O3LmZVLBN3hwznCd)_rTI~NF)k>(f2uij39)z!
zAv{D7t`E4mjU`IEZgFMMvGsRtRlf%EPN4ehIPbn8p8b^efVZ(*;)1RE<;XMRxp<$K
zu*Pdn!?}*4>p~q;+u0F&S&77AT<Sa{+Bn^&(8E9MMLFn`8g3rdP~yW;oO%xFDzS&9
zCgPw{Vr2wHSe(^{M8GL=o;e<Eb~Hw8RAI^I1Dho_aZcLfNwDXlwwgZqzTXw1P1=kD
z9Y{gE1o6`D?ZDF~H~Gfk#?&R6e^C{$00kIi)FCp+$gFW|mh^@=EH+18JG3N$UA@Ip
zlcpSmcpuI>1RL6%89=6+tfrot)R?E)d6~odn-CAyGeaAGD@4X>+BJ;&YZ7jK-h&*Y
zDDQHP_ofh02kCQi%i+}s8Wi2dJW&~B*1<Jk9oz?@FB-CfQ`HCNzqScQ2^_K~9(iP@
zY~EWLZ8@r$r8~NPv3cNS4C_EVz>sL;DG)jPSv9NPq}{P(hIUjVL{Wq;g5VhLN0UzK
zE}X7}UX|U=<kN8F8v;v%y<&090v95e{kwTDoeKg#EE=}?ek;?jAmf}f@${p@d)D;<
znj0r`Ocd}<#9F79n6jsONSF;<RWQg>5<wns!h7vqY;xlV@_^Id5EsP}6^?vJfQbGA
zx`G&kq+>T3P-H3m=b|(5?OfepAxFh@a_v*?JF_x~n&?MdaM@W2_hF%@cqpI}_6oA=
z_C&>!f-C(LZ8>=Tc-_GCHUneH$M*K09If2`wDu<Q+U?bBLEc$;cyt27C4dY(G#3rN
zv{ta75$QoO3@E*V?*8sG3S-vVn`!Os72He6JbKZSgOrcZ^nXOdZ~G-j`zHxft@a$h
zAKEWN)W7z&LeuPsingKza8$)+jQidm3x3e{YkVLT(jnJ?v3diPyn>7q^0m4$bp2i@
z4<H5)YG>JH5qYgJrFR<!itHuWrogbogP|0SPs#J$%=M!^HiK2o8w*gg0UY9h^2dws
zIeWC-8?KCaV91ZWx2*T;+?Js`6+_{J2fvM_rQ-EgTdprl^TvS>TjZfed#8zj0Mn=R
z+|&QEtyzltt>8<aP)IJQsCdSL_s2NX9xJ#}fQi(S%p&-TePImlfkT8)+q+(s1ZFLN
zjA>-F*AFM(05m8PkKyg=1_X~NGQM>w_=Y7c`?7WnGWmAlhjE(573WXOI*}(S)^=s6
z>E+kRHD^gjl2&TsQPAg;)BSrpZrnRe9xfnocAE1Mb4?BnsZwh-t8On9xX}d3MqsM`
zT0Q2_$=ee^Rh?RS-x6cuE|HciMZaA~?*K`iFlXtHHjT1+V)Xc|Bf*$DWXD!GeHr*y
zP&7{BvfJby+kuDN<2^ZjlTMf{_D%%<I^$TL{#0nI=|;BcRgnL?D;7wE#AaNJe>4t-
zg#~{aCqW43bt%b{z(8&2wPP{gxyG70Ts;f3XG5OD;he(qCQQ2`)ME`*b*O==Q_Xcg
z52r>pCScx&UGpKYC$=YD>)#ti45<g;Li^`F-6y;gL}kT-n(Hd8#_s!Kl@TsKLn$yz
z6WfQTT-8kzlX?MUDH}0y8zbr?6G;ot-M%(4PjlSpoM`KmLLJXyC4!5;XgabKC14~r
zk;<1`raMZ$$z&i%bTRx^GF*~S{Ak;vlErHDPM#&FIOI38=CxvSB^bh@?2!Z1%hEqk
zofYhftxhlgf^*E2N1NSw$nR;EbDa_G;a!HIdkgdrHtCt1@;E4HVH2z7zBVWs-2|})
zE6LPh+S}6<0ZW(&IRD%t;}Qa4#mpx~r$RZhrYuL%#R?Fl$HH*lx^`A)*q2j?yt_g#
zAk=}TJv)~$1z^^p=3!m7VdE-$bqB>>Jt?&EeYrKz7y}QIkFD>HSm1|E2=*<Aeq*BK
zGe$qHd8<BtWDvGtqfpcTS%b|qov6{MubFB`g#uLV{!7+*&sr>f@<pRsMbn0V0Oz&G
zyH4;}u?~)uZt&lB*J~j}K-pC%8GEl()@gI~eFn0xp7{fl$Z}hIj5VCKC(a<?DG1$5
zEsqa$Z*%l`NonQ(?(?}gxH+&JFny1@^D3Z5LZf#Pui-=QG4x%cJyo7;bCb{k(&fHz
zAh)uR+NWuM=LfJva|@#f3JSksYDcgT455$>gdOeJQUu2<I6%($dc%r?a2IdE!Z}Sm
zd-2QL{@6cX8)=!^AgF<VpM7gqToby9Vq4xUI|u_|;{qx0mDjhFDL?N!UR_!Z+J&O-
zFOU*@Dd)jPa;DF&La-p*?ay->xp+tEKK~&kxB+%@gAR3plfuxiBVbI}Ybyq}as2x3
zen%;A^GBabK%KH|p$fzXsr>IrUaQ^#CJo^diSi-8l71T!bo-?z2OB&$#zWTD!<Yh`
z;k&ap=m>C@U`rz(^Hbbuw8B{|4R&CDFWHebk#er{Zy;E*jhuoDXE`osMlWZb_CIPe
z<}V>xtrCvEvvga@33_t$HD&Y<_slMvG_0*D#eFV{v?lm}jLKy}*9S&l-w`$AP3!CR
zI4gAppO+L_C_o5@5n8PU{aq@O@AAjZ0!W#5aW>i?sDA0j(L2ZpCYs)msxMTAPy{1%
zVIdE4N-MWj3VE0i4rRzHeu7*E=Dm<)6Bg^#mg4CRa@WFPKyy@&?@+KdFL$uo2tjQq
zR07WO5*2{cHUNFnLWVXAl^ufsjWG@Jy)=wl4<qmuVZv-Ln+9lKmbiTR#nowUa2#?R
zg`^i@)R|)*iGq|ZJ9!mb^Ucot99(^J?h0MbMD=zlU^?n&KT<B*Xra^zkp%U1CVO~D
zkNxnD3Z2@k8?|%M|5P9(-epEtpORgrbPqRW#11vSgJ0}d(z)s>v)Y{8lNxt}AiAG=
zJdoO><bM}<dL&aYtw*Owbayiz|IE8x+b_LP>|4=bvC2I6z37-e0}tKP;eCR;`&T5;
zDjMljauxZl@rx~<J#F|soq3e~e}<>SW&zxH%I7eIV?k}J7jB(jkEyatbK}T2-d<zK
z!9boD>#Fp6W1So6y|hY53$nsRxp6J_<di=erc-BVa(9g88Jpz%o|j`n_DTG`3=hAx
z(fVv-a?gJm<g3}plcKny{j~0O1GhKF4~G9d`?mkrYIpRV6>XyWjP9XB5J0qd`m9w&
z<a>qlNh#y-b<e%WzqSU$+%`$CAL7%!#r=?OEgJ)&kaFh|SPzyI>C6<xQAl6!n=O`y
zvtv$2L;uVZM~T^&>e9Ji;NEF(6&|nRn8o~YUDw`$j=~gr4OH!n@Jr+vN}|K0cCBQ_
zI_uT(&8Xw%`8q>!&@9H^3Kl-y_?JA9Hib-Y1KDkVET{q7(}`aM=f#Z{hXxkec-|F^
z^HT(eL_S$2+Qh>IpSh)fug82QzOgf+TtB>*@NqDHgrE*99EYYB7nvm4*O2On&-aEl
zVyYM~61x7-SIAaZ0de$m&??*8FUcye6i2d6HHFfFP+D}m&z8>WqDLgku@ay!(Va5y
z%!ZUTK&0Nk{~6sh!*Q@kHT(Q@;Y0yCU+}I^ia%Cr!->UYHd8MC`kH?}oAv2kwb(jX
z58u<=xlTR>gFF9Ay206JuAR@tT)~eDo#jkHJ(}^j#D5*V@4+Z7Vrw|Mmq`dVEIP+S
zd0=r0uK$L|9IvPCi)LXh92^G@QQFhnP_*Xi^kEuNYttzp47m6?7^;(mR3w>WUZ5y%
zbAJc4+pRiFjpF3L23li>ias3fDW5^BlqZN4?!6*EL+H;Tf0B=!o{{%3T?7m6K=fEF
zsKN?9z^V`QH<7DovBMg#TNZMwuKzF^m0QNp<STz4!9}(u7=gi1d6IYiCD4r0{u%hD
z3<%(avZVtW=g^9%V7tYbQA+y#)67Mv;2oF(l7uk{ERH!|l|w(ioj$_+h-?(6EW*Y?
zU%Y*j`MV5<Pc4&XDGUpS6i&|qc-P8+($=^0(d!(59W&2IDYCDLIU|L^T6Eo)zHG7G
zqXiBJ4nhvDJL*gR@}H(|*`1_FBiOdU+s^K%dQh-<$&M~sdlMqwj)A-mW86+o(k=&o
zaq2y%SkX2H_K1#E@xjOU`6B&)wJV`Es8frLHp_l!&07i$jJ#%8Sx%kIK9PJs(A5C&
zU=P-B7hEd;8Pm>qcMq{fJ1-_)5S>-KEv%ieWk5f0?HBNX8APdPw1X8Vg`E*Z_^l4O
z=fec8_8_zl>6TGaz-<)CA3nVKY5~u+;si+XFUVM&6%w2UXdbrRM&bT>F2Mk432#5X
z2e0(c*IVqSfL<G4N4z+lTZd6AAhEa$K9ETHd|iqnS>|!5%L+OQgdV+?XxMDJ9^Qhw
zDW`<CMEpeX0C<G!+Ykg1BCZ96p}O^_3ws_`xcj772-qD-K}c-!tLqdDY}o1IQ?YB+
zP5IY9?4SXMD#kDA#{c(8wz~LH*`nLruOu%YqA`&zHimziPIK?{p5DbgEjnwouO4it
zEn{(#wK_{fYOlWd!%`U_&BkC$gr1|@X5$@0)jW#Z^_Ts54u7OM#JGb8vSVDpqD=2i
z{(QMD;(HBvek!kwJshDb&O`06DFI2ku40$cZ$6#VY(W%L;-9jxae*~uezrS3>zmWz
z-;3KTZXqX)VM+D8dhx7kLO0}v3^;FHp6_Rxc*CUg@Soaq8a6I~yaA}EI6qkqSWWo{
zJ@goA?_w_I2QrcVV_J&SZDDL_W$*v{=e|<AE(V94y?J(^dJe9Wqr_txxg~y9WwKz*
z43>Wm_(p|`GXPBaZ!h8O+Sus1CfAM>L->4sR+z8gcPM!dJU#i1)|=$Vgx?rwUQKs9
zH<K;FzjQrv7ALhmj-)FufzTOlkeEksEj;r#*j@b+glBK#JYfI5K1JJtTlovZRm0Vj
zr5fNfUu>Mt(;Qmu`L2>(lN&|Lq9oTOnD*i2pYO~0PD5hfYuOzwuzTo%BFgl0uxQ%v
zU+7|-Z1x&y6E3Jb?Jy(k<~ljEWY((%JZI1l0>DUr>!?<x=NZOg3uj@2dM(zhxgt4(
zL(X<wd7?lWz%`FL^n8<{r#tDbCrT_-2Bk3CAT&2j4tav{Pze;ozRuZTG;e1QR;DqO
zm~#1eUOhQzFJR-<FMVLNu1`PMX@n;@K>nR#+&N8(`U3kSC15xK^|7zRA$gZ61I2+2
zfc^fdg73*Z&&Qwvv;Z;8e)T-8-eJX;d`S9Q?JiiAc2TN!CW1Cs$k|I3X%$t+ciwJO
z(w=xpIi^i2I6&JA_GKtCPQz?OokQ`BHvTmK{5k+$j8C3-XbrL~e;qW#^_`S*Be`8*
zNzC$vo1W{yL7<B;xx|M2S~&mUJJ<i-nsI1x6TR~NUVjaXJv?YtJBHLcZ5;=G=G#vr
z^`to&viSa#Wv^CIWEN^!2IZZFu$&ADv?WX_Rk@XUSie0j)_{;V4qC|BsuFH!3>jMv
z=3g+<{^r2K6>BkOy5Uuip7^dV!O?-@0fdc1cmWS2+5#M%+aPs;e7nD4(T51BH5!n{
zA(d19Idf0MXDYPaz^=$z)V(KLV~zS3M>BHEnz?{MF@hzy=ywjPJi`<Z*NjjCnacdJ
zy+8)}sHTml^)=ULGdr$55tnHcT4mUxI(IC%zaP+4TB-G&*od@oa2FwEZiN4G2HjNB
zKxOm`5J4~f=rMZ*ePGo-zb~~yq>^GDosVExi>bpF9=-o_Xebm4xph_#o4amO(ChPe
z&L}X*-`nD!Wjluu?7WYXnCHOU*`iEh&9Jb@fY^8ZIoufovE)7$*!K20giU^Frdb%o
zq>D*JR1+I*h_6K-ig)a4n2A{`;4MzOh-#v=&%KSrKd7;c2pP0nfweI-=<5w}bG?U&
zdF_@%WIeHev2&6@j}CYdA~=r_l1|i7RuU%sv0N2L)RU$!_H={dxoPztvAMyfUPsIH
zdgt}SzE~O0H&hDL;Vy<RPqp7Gv{+nYgOwyFdokpDHEuFHRT>}KN$;T82!Dj|^FK$8
z@zL`2DdX6@_iMn>3(&>)s%iY0eXLOADS!n-==vhfOi#Lgy|$QrTs9Dh<iWe64Z-oZ
zeuK}0vC2{Uo4+^Ag@NF}`g1(>CQ?{e)EAMSvRWbMp{)Ish<B_Lt+FNaA`;~l{fmJp
zC;_a02{*OfZ>_?OX7;*GJXvw{Is^n8#QV+#lAhH%<3C_5ngBS##RJy}xe%Q3Pld?~
zlK!glmsp?+LJsS9%tL1n;FzaLa^D6*B2XbjRv$7H%-oU>b&Hl?oA&A+`LOjZ-3jmm
zfnN$mN%<1oij)EJ8&mcW6ANw&&DWY3<3D~g#VP>a4P#&Uz2}PQRy+I)^Y78~j4hzC
z4`>JR&|EZ`E**8=1+(-;3&9)HABu{AaEN*nf*W38pfN%f0%Dcn;+i>AKZ}0=`0_TO
zS2lx1MMWL6uBH^r5|OD449%KMHtBUtKDcPBK?zUoH8m0lCz?Ds%!?K6AH(J!7rw9|
z?^7|MXS-oyUk3^57yCa@({Z=fC6*WPmWRNXgSBGhwy1SCdktup9U<s!2^Fwj08I|c
zVKeq|)9^3Pt)1HnX9dqopj|61I1+I68=StFZt<H2yZ!nI@)X_%xa?X)(?;<3rW-fY
zr$;SUla*l$OPjdNCoa1Ebq8tc$I>h-dak5_S%A@d5Dn-#9}#vD-~KR9rtZcjf)zF>
zVTyJFbumWf7^e=s=qVThy0K&is5ob-soFD4jcR{X674%W-9a4@XaYvYKNCCl8TP-Q
zRuAjTAnL6_t)`1G`nIk0vJy2jd?)>X+}-7SLzA2i(Gcr|3vtTuHqR)abYppKwaDRM
zt@iJ~QIMMvHpO0yMUA3<`DwY7ujF2?RXgvje!R_%HA{yP;ew(uPS-Q7UGCby_Wdys
zQh)rMTxS*j^!UCb)dYYP%QI1-C9-fo$zOB1bcTve0ojAY9vh{yeUI;(*@czwBuv8O
zX~O^NlSts~ab@9?hN28(fbkX#cqHg&e@r8>72jT8?g+4o5DRgEUuRBQ6d6fhd$JS8
zVdSIZJgrXNnzff!{o3r)u*G&!LZERnxCQO_ekKe9$;b{hEMwG{iqj|cvV**_zt((~
zA@Rc$UqbYw9thX9&A{7*O4!88uLZe*{`g=C_|p&q%8gGm-)dEjH1ms>7M7=H7c#HV
zAgg>Rtlc!oB9uT8G8)f)y*-WF%&%7Bn8&qmM|5ql$Y`lo2B-({BApM57}`XU`k&?M
zK?*F5gK_~isFlcrYqyn2hL3O+6n_I8d%q$=D4gB9-ZpnE`+L>TvA#uWkNa!xb-v%5
zTYU0!YsSZ?&p|bU8qm8zWW{3M=G!(tHPuO3?%<$PKKCZ-3y(u|UKu;EhglyTIpYsK
z82a-mH2vOBX;H>VQ83P4um5F8>6cJ!w-%0h`qji20tb_Ox@#X11=TP?Vh}XrYHCHQ
zDR=CkGdWS(3!Q6tbis2&*XR=WO0z`NgJ9Nf62m_WF<3VuVWUU3Gkkv0^?t0|@iPS5
zSBH1`S--hbxmp@kA4DDWDdFv%4A<$iUhqO{RYQzmz{j%46MPIT3$vDX{u9|quxa%h
zYP5-h@}s`jGR~A)%DC-yRu2)Xp`dPyfb_(ehnj`Lx5al<_gd;mD>?vbcpj|vA%o15
zy4vn%W*zOF-uZ6n`I1o<G`Zn>j32A7ueoU!KnZ8Md3D3!Ha7QVf5~84vSvhE{pPl?
z2R5!0;ga;bxl?u~w2<P3t+gKLN`d2B_3M|^n|mKt7KwGrDUEdx5RK_z0pEWM+m{U^
zpaPO8?A<WyR_(NSY1}SIMcuoHh|!{Tn`7F{6R$72X<L43d7@XNZ(4z%@bg#?VE>)c
zs^8I5$yi74MsG=YE&m+d?)!2m9+R(DXsLJAQRjXPih2d#%VT{}h9xCMv$S_J6}0gJ
z2D2o;3M{=~`^;rh9Q;Y{z|Y;pdEY!T8@s&dJ2;QXqX`11?iZwAE&hm{*9|C}i)ojO
z*El<rqYW2yLO^WH-c%9jDkIi+|Jn^4D%npvK8Iv(dA!~0u5^!ej{zt%WzG1s*ED-y
zem(TpGxJkF_a>Le;!mvRdca%LZ(W3ei)~?b(&aUrz_Y-Q%F6Pb_sh-MBaKzVN4F8+
z0buKXUAK9;)u}-<VW`vrIF^hRvsdVg4mKTpRmBmsnDM$`nb`LY0-y^qw1mboq*lmw
zEIj;b2?v<hBByC(o{u?=YY;Gg`sL3FaAB-srZ>F5KSMhi^Z0WsN=F7G$kqwM-Z$HR
zRsLtCw7E%5x=J_cQIjW4Vyk<8<Q?cSm&f>@lfJtJi&I>Cm;r_d-D|%M8!4IKeNd2o
zY3`3A!@%e+s$Gj;YaXU_eH7=rV}f5SnF5`q*JiuGa}MJSnngVw$T0B@E`|*DEa1bq
zpG1ANc!vJ5(~)A4;c1bGP=}aRwGLsb`Qe%_g{REp)B$oVrGrap-|Z!X5%|Utf7qhP
zM4_OL9YR?`@Rl@#L`M~>e%9Ak_0pQu(}hq#@F0=XVm42tJ-zK4NnZTZKo%3AmuO|d
znfGa`?o~gMH3Gb<R5>~Ba^O!<#C`LbX$+OAUc?Fxc<(9O!lF1suiFAT>_el>#Ex0q
zRR5#I2XWm&<De1KmAzjB#A2*+Em;j>dP}ewxp44ks6~4kT5Pp0tM(hjh(+;D<bN_)
zyv{73B)_^oq1QpzcqDZF<83D)tRKc3;+p848}?JX>~huXPH4agA%TcmTdEQ;hB?gA
z`8iy^3p?<n$2`B)<Z#9fI%qE~Z%!I(oP(IbfULK&F*wt%_?&+3i&b-2Kxq&*;BOd8
zLi_5|Je>>0{3CIdN8H!9W(y~d0=NGqrm5s;V^;zc0nHoiC2Db}*D;sO6uVXJtj078
zt=U?l|034vdJ^#TuDs~${&fc&2PJ?a4<Y3L1@w^xs82KQVOS)dENz&LP8%q}@{HdJ
zo#x9nKHhjU3M_e9SB+78C@sVh&?NXnhIZ?>xUszJep1kwOVY_DD}pCq0Z?G8qbd6I
z&o%$EU@CK*0IRs+cEo(+!095tj<FPAIB~c&BHh{e;*9)e6XxD;v2qeS>Q<txln`&`
zX(cJ9Kj9?C7JZzw{IpSn@VVT-Z08n^JC831Or3p}fhkUUEM6V@wt1v?9<{gjZ?5Uv
zA+ArP%p#dQwJoO*d6DCu!+8?8=vVFs2TfYwKqIa$c!$7{g?Dk4fb+;$<5AfkPIumS
zj78@@W+v-ZKSIASM=#7*Fv;N#)e($4qs`vsb_w-*A4*knj|cNnf4*VU7cf1qao2ih
ztL*%{w~-n?Bhy9JvCuB&vd5P1@%!G9>sg0EF^k{?mW$c3?h08~v|;<LpK+o2nVosi
zbj&K0`_7DekYYnqETM`nrn3Y>;66w$5az9MLj#&J%OCdIGIq>F_X!~r@0sk?*TVz9
ze__^KWr^?pQYiF?bzyBszFJc9k1KZECN-+}mAnG)++`to&VW3|Bf5XvbIxHwPfqI^
ztOUGS^8U>W7(D(X2$py=SjPSi79;+U){XdZ0ehahkAz+=%gOgWyhyKF%gRgM0$M8g
zwuC}zvgJ7yjY(SNmsVmqi2hN8P;|Vs`^pXXeXO1Gy-d8ga{~~Is(5ESCjZ*({6ax$
z?D^L2r@Y}tegktdn>z7Uxu<Uhj9Vz`;A4VAno3Pwr5>rQgrSxeR~(=&zT<qVJ39N}
zIzf<0GacF?L}3J3Uz<q8szOa;YVnnCSGf11i6&!=+ux&Q-b)SfjdXFwiMZQ7%PAC@
z>$SbRg!T%h(3>C+Mnf!gV>&d0F`npe9zPH%CdNrxzoaEdAr}Q1bCn2CH_8A)hgw_b
zFUJ&(1C+jbDtY}O0a-L}R^5#W#r-bbrE7UA!H*Og@@;F>WHMiv^e1BycXaF~r`<@`
z2kSRImigM4iqrY!r*kJRmP?D<G&j!T5*7|U`ozmJI_-O@)qxy*xE|Y+*Y#z9>o!;C
z)Q?vipVbEB{U+7v%v~DDF!L-gxNA^=n3EgNzs+*m1q8S}1|tR@j3!~eLl!TuyWEre
zU95>9WJlI5w1+Khzp<wPcJK+DYx!u`x#*!b#^^??bk}rHD>}F_VanPiJ{?Tq{+(Ry
z^R+}0pa>yLc+8twkb&3CjBO2p)_Ym?G7q%2VaR)rviqHR!SL)@OdKAyQGmhR|0^tw
zDrEKimR&6k2QnXMSH{YmfHYiAM+^);CC&!q|Cx!5Rx-%wf2$M5OJlTfH6wl@D$K(-
z$K7j^5%D|&WFdAeqyc&Xs$n7&iUH|Aen|M_Q$P6}JZ~K=Vm;UV9=192SAw#N`)+rB
zq2qx`(0U7T>qW17pOrenC|BrFrcgia3I-8$U{6LVDpF0oWWENip9FhnOS@)b7cOp-
z#9hcqFGa<Tf8TCZF{#Hks;nx5B>B5DvQox~_om0Cw+Q2&8H<CO%qH@6J`=ip;wV!O
z;iXJW@5!^bBqUV8<FcjW14+8bSmn`q;IF>;=z8Ee7xNt~{Bbm$C`R?;<lvtNwvEm?
zBPy&M-nyNHeq}0ma2GA0)fwrX?dLM`xqt0p1yp@eUmQ1Foer~QPj&uU<LFy);~uO9
zDFk{XY8@)`pU3RgE#=bb7m`ykvx^Tlhc!hx(Qc#Uh}MuUu1htoe4fQ`dTgXD6v0ZW
z;q}~iIpYy_H*Qa5@w}hZ-E}`FYB3T2E2dYPw__}5h;T7YG1u9j@}*1AdpW>K#R@;|
z$*U+kD|<FZO8PVXt>$P%uYMZLUO?5MGg1C?+I?<5j<FYINy8i;{ZWrtO0me$P<x;&
z>5TKNNh)pyYC!dNi(zTtm7l9(ZgP|cE8Oe0xwR_7!6u=gi)`7RXy(mo+r#!Ryh;$x
znd-_Y^%-Yurue|{$Z&4`blsJDBXd_~><48B`pi_Dql*Umynm$R!(r>T6tTjHwF6P<
zeCtb`$B8q}p5k=^q<XlBD!HG3hvT2TvbmZ~pHhzqvKxS)K9;<G^>_`7f-VO9UAn#>
zD{uPX)bS7ZC6f>Ryq^v~X>T_W8XBUB3w;QBS>6C6^;@>Cx2=EkG+FD}zg*&5t4zub
zK^KQ#oarQD7dP~!Ce<=Rx5Zz@5>xqI`xOSe&CkNCk6Zt3<PL|OD2sERl757AlUWb+
z<Q?f$aO9^Snv9y2D?%*0n=AnAKfH9$QT~D}&}HDG1I6F^%j7LX%xU)D@R=p5aDkOY
zxJqd!9;2Y9Qh0H}jbX-APefdu>EHApom`PS3kEA%p$|m4r!UVV>3_xCh*g@j>nQ?}
zD*v^7yED%&XDXZFd0ye89EB-Byzep>{PZJ6hJ#$Hp5s=#v-C}VDAgB|&s2n4=OuaK
z`_(w;Z21}s3OaM5p?ZmzRdz2z6*8vk$sHT&md2W1xpxws6MMc@zML{1<jH=jC)VcS
zZm-0XiEZD05okAdKqq$gcU;>>Y3E<Sj&xzwEheVo(KpMFOzEXgxre0+(@b}i`b8cp
zk9(2}R-04d?MdyL>j{gasqk27MqFK{!6cIp;*$Bg(;9j^P!BX4MhZ{r#YJgDF5?@H
z%Yxj^E!GhQ&erD|O#>xd80E_+G==cbNP+~h&-})ZK)#OT<U!js-7gdovEk2r^~rEX
zx0faZY#uXL9%YY0$j_fK3I633>fSAWMO8InEs^iz*!tK_uc)uE)bJZ6F=deamg890
zqpvm9)DZ-FX6NbEgP`d=WmdYOfqKiu;#^7A-Zv0Ss-H=;AQB~1ENdDsNzVtHF=2t*
z&njP2cdn7!F(D&TiIRy`+(HDLKc(+Y-*S}-kQJ*_nDc5cH<-sZx|``l(z{hdtG-%@
zI=_2tye^+z;*K%@OF_`2xNZok_AlzB+g?o0$uU`U{axI0FSpAQ`n)LqXY8A{G$m1&
z0#c`GU8Wj!U27M08$-N1&HJH%EoP_wnw?hs$8$3^tBP}J+s7$#BP?YfK06VpwU5nn
z;E*)+x@($6n5a75S=Kt`Gj`)V2!5!jVrwnOBubGcO57x7$pkbgn+02t&SW=`P=0r+
zPPx9!Nn3h+?2^dXT^&qx9wbWaV3|G48UFd<&nkv+wTJ0hmn+{~lt6jyT}tDR{__%%
z|GheJDovJWw>bQ!p^3pTf^Rl+Rtp)2dSV6-#qX5kC$r|?^T^EIC58^7c$03rws=(4
zI_wE0nN!y$JZ%<*GT6ZLQuKYEt#{)8I)xV(kGJO(flR6lr$R!94vzZInLgu;1a|`y
zpD_Alyk4Y#cb&zh3ZKcWWAow;P2{U0_T_u(?~Fz9jESMl|K&L&7=mQTYmYNOQki)-
zZOy-QbN}AdX2S~WH&4=dxitP}E4C%v0JEg&jfX3o+mNN{tYWeL(tc#@haGSB%IqJ#
zh~nfOO`6|%0{P!W3+78#|1_5~u>X4CoiY0ut~hxVFu5FiA>8;5PaFTIt$5_d6KpZB
z>DyMvNN<|jhi$sQ#psjpuZ6=(*hupYD8HU#F~f?rcVgDf4w6=NEBA+{QEwU0-W;5Z
zcD$UkHrt{81Ms}Ss2H5?Y9nRJUwy*%VY25F6H;VV^SuhNnY_S5cZ)Kk%u{ATlgXJ@
z?hrnW2=?IIESb&n*Cw_%3YenmP{M5<aXwTE`29|V0dxGsI_zbgylHyGnbe~I;a>-y
z6v~ATpI;DFIQ;sN;r~=!hoPRxU<B5XN>jpwQ_2*TCWZ{^J$$w2>TrMlhq_Z5d6@l@
zLf6FsFR|r;<6u%L-jbJQ%jqy3dpnY%BIIkE*SsiwV4i1dON&y<%0=2_Ba`~8QI3x~
zY-TZe!~r{jiw}AZchns1EvDW#|4vUaubm(f`xtnX&p^sVxZIluH<`RK8U;&eFqgmR
zW~FE?2ir%JDI#|=;o7kh$A6qn5WhyQ-Z*+!c9$c6CaL!DM2g%CTQ%#{7>Ul5J|`p(
zzI(#H&_7%RH-lNX3Y!vcgBPPVc)!cQL^?#as~&1dy#0?+Wrx*=i)a^M>8o(}%*gsF
zXXt7@!IYWO&j5FDuM$f2`Ni2I{E1yon*Hfaw+;NgwI0_?p}%V)bMzw1kETOqvUzG<
z`Y>m)YM80EvQpUmaU&Hy3$i!FFu`FB%oL`o{FnXATP*5_@McuY^Bh*HtijkcQQ^JE
z9-o<{grCNw0b}|p`@}C?`~a0QXL~INZcn`4xi&E&fByx(iQ(v`^9akVIwR&jUeoVH
z2S&?Pjg>!K1iyrm(md;4JDcgLuQ$Lb7gO4?>6SuknuSHr8nd8arh5bbyTA3pU$|=?
zFIUI#9E3U0YT&YR`0Hk<uVm+?WXp=$_5^Jr@^Zt2G(!|_mAyF(e8kk;lJ;d}t_-BO
z?>e`e|6V^hqgY^DPA3Bzs^@B4+TUgQJ<5E1{KNG!!l`MOnLXH40yH|@DyBSrn6*7b
zAQ|ET8Ay%GGIwmAExbX5vOgec{G7=jR%SDrxMsL^k%s5e&Tw+PA73${yPHWX_KlWd
zCyQ_0LXv4xWG#T?+2aB|LL>UBi3eAi!uEJY_r*#yE$1yA4czhbb}E_Gd%gQFo-|NX
z$Ph<uY~}v?EAc#}?nhA(<B-Y5`QtWj<!xi5-O6j#<m^@{he|^GAEfX@0`v*UAZ)6(
zCJP4s2Pmvm)$|(CJ!jnwlBxhNMYyxnwHmqm6&x|Jrrf|pv+5b9U%ci;+Ob^Nkv=zD
z)WbJAd-|fd%VXjsB~2<lV`MD}$Y?75IxFI83E~|aS$bK@sJpBAHYTDyO1q2k-zyb=
z%Yl?ZYHGb54V#fAw#%EnrmnkfvB&8+CCpGlzCr@f<xAr#j-snExp!%+pCjLb-*Z1u
zyC@{grc8QY_3i1>>%wwjkYoAK+t@m0Z>=(-IkuNGvW;^;qKe{)wHA-uqP!@WM>jqO
z*;|T_QH?#y_8j?Dr2f2Lfn?qQSEQpS7(a_#lx!qR4aLeb8uUfiSs41SC3cAc#fB6!
ztF=pe%jLMU5cP^AhiuAIF63KuoOW_|XYEJO$-A(rdhKy?P2S0p`j;F`TOeJDYkr9x
zDQcOG>qx+{{K8fFRpiQ5U*Sk*w!<SIzn8*y^RhQ1@wD0E<Xv6TZ1xXPi4NY{25q!-
zc7848fj$;58IIoVhDw6sNC-H*=W@y=i!15|%vloq&s(^r$W!jc?Sf`<M2Z+|mrZxJ
zq4i_hM~n%*5?NdidP(+9lcp$n{80&Is&gTQBV!WvWD6M>%E3oJ>Rzc5+E+jQ@AvF|
z5DuXuNrP#&)?t>tRJ}7yKWwkx7faS{*(cJRTjf+FA+KG1imnmWc87sK<M=hG-U?~Y
z*<8VY1ckRbd`|l@?12l4!=3*snq?REwqKza#q~IxO{t?A&p)~1gQ(@kiy_(|who`U
zoI<Df<w`yqIw*-xBXaq>i<CIh8Jzm*2~i?Rr4gTqJaK#b-V}+6)=))CXq<3xEPf=A
zTnMQN3#8X^aCiGwKaB=GJkmJGXxSI2NdM{;!O8MWp^4vtS)sas>}vXo*>!^B@NP)j
z9(4Ih(SdUE^Li<ce1>rJuPrW~_sAbzCT*p2jDw!y4qRS(Q4dms86N~Omp_*l|5|6c
zE=FBKzsy|on;28P65%a}m)egG%B45B-B-ydaP;+MM7G}8AZ?2_Ov)});^+$U3iLz%
zyTLSug@Jd@xbXa}@0hN7YzUvY8e6e@ll6W=h&?|bonx5H{OZLc2zE>_PF6W@|CCx&
z<hh8`+E1}|V2a_eT&4{W(-7)nR9zxJgo{7vX&Padp{?%fq`3=r?G%{qCZd<^oXNle
zlarlfp4IU}LNZH*@8a%OMLnKKrCK!n%Us}=9sXQwD?bxZ!y?%tw~_wumrD8zBsjgF
zwiMLuSwVhY#Kk|xs!&x7iycDZVC_4>U8N=`wJAQnvSoHWtr3sz>O(x89zU7L=l21M
z<RvLV*ewd(C%J!j`W&_AK$J=m+V2V3;|!kxT@2Dx8%34HlE1AB8coMJFiDFY#*q=S
z%~auqPCw>9c9K4&k+EQ-Mib*Dmq^Tf=$Dh6TTC8d3v}-kbkh`K!hMHReb(HvxvM${
znu~DgbRFKebrPSXV6Mv*l&u^rAyu!Bn_D?egb>d-xEAsnT?~SlT^=vRLxu_EK6|B)
zo7I^|r2N_j-kETw1cjWXJvbUL909(bJWp~YZ%eoyKAP(5^B)0d?u(>)+jR$$-t0R+
z*|MCVRztyJ@QlSBzZwcUynZ>;FazP;KFz;W!dHIC{8nbR`}B_KZYhn(N|E9@?m_+}
z=<9pR<WPl#wAA}~dWPn|B-qYgTc)#D_y@mwRKQ1_D7Nv4!@Tl`^6w^FB{N7^;@6*I
zqH*df3jdA&EsDbnesq(|D&8ET{{FFOOjBAs<A{R~HTDpdmj!LVS>9EPpR2w?UW-la
zOVW8#ii>RseY@C^yEoVKtFGHbU+F&hRy}HoW6dfQKqb|%n^xh=3#z8xDEC^8meJ~D
z{WB+A-B1GMP@v=y6bk;9W^*|H@ASie<~s33a%M_RWN+Sb1!g-@FXmC|$;^MuAYxxv
zms;mYSC6>b{u<%%nw#n0LXO%qi9X=<P%8y>3Z)pcz^9Tl8!sb4)!$ED<1tTxxr_Gq
zc!aS<mX`4oqkU8DGG4bU;Vd$ptE9TQF=CyC%>k*-!ux?PjLnSl3gv5N47VwAnz%9-
z*Qd2f`V!tRggt5mkuo)8bJBAk6J@x;MZ8zv<>U0ox7O63m2id-IpEo%%M>~0%9uYe
z9_^udn(N^4co)Xb4&kCvt<hoK?jS$6w_9HgpNi8i2;d$a_;j|t28;A}mkp0ASh^qb
zPIi|&)jQIn<-VGy%6J#09UW42PXTgrEp{LD_yO)GNBVfG*QI+d(ugj?`kkwC$85&q
z1DZ7(Tur~#M;FreV~5)fuPye?Xu%zvxQ*VvqTFfqvWQJFZ_TS!9r8AN&5pE_ltB;e
z^KP~|W2NFN&d4;LsannE&{G=&5-@5Yv``{Qgw8MciT)fZY^eJ`hfbnXYYV!s4y@0&
z*^O;>$62E_WyA<%+qL>)xyWW{rYk}@K#a>iLmaDOtVT8k8t}nv7^LFD{kk6cQ2e3U
zg!%A&@5RUOqoDdf(QeZO@BF}8;~zv(yLjK6qF!L7I*(ffao8UPtlkvo@O$G}TNE~Q
zkaVXt%e50j9jp3J;sS2k><A&8;C%TfZ4;4uk3U~Rc>3ASPkX1n$coNf{TgYkzfSw5
zb84{Vs7~XR_+l#N^$w$BF9{Rv?Mf=mf<QOFTt=m*wfM{QFcpB?l{=-iUO4M>;iymj
zV0QxYbY=VnWuFaUi6})$sNR6ZityMRCwt)U*|R)sjy>PY`ficrA?!%!u&Dl01*1)_
z5KQQw=6!*|^?$4#i49I@(!ggrAKOx~hg1yZrr<)>_v%M8yxKcG(&y$TSDWD|@b74Z
z?@uubR;+PoRGSteo#)>Dj7{7tjXY76!z6zTi#{CQUG&E~(i5W7@TC@Qvu>a+ge8`F
zPxP3wK4V+1d&B9&MC?n&0)<$THQZxp@FwTN@LX`PUX-dKFPXuEu9)7ly7EfP3%!Ld
z>6U<ns!U~(rE#R(a*RO$X+C^_PTD)LEeShm{3~(MU3#yL3tI9uKKfd(ZI+dk8u4mz
zw*l1p0B1RRsU;y)PeS5f=q=Vq^LdW#@pnEZX2OYXO~OyUi|~fx;Tfv%Zkqj2=~7Cb
zW{Rc_--)s}n{9a6T3kahw`XZ+xre46G63b6-s0bc1~Y;lm$<sdSloh4TEs+T|42a5
z0dG3_#~=ynmo|Fqqp@!Ux#@(~U@R;Yy;O^QdsZ1JN-_modYwjuw^DTm8xcC8?sPlH
z0GH=S4+jec%DoH)HF~K3-=?{c<J8r5!;v&_4KJ*?Q^_Lp&`(n4R3)`x!yW0;znh5+
z(2jnyBk;H90P9mxN3y8~5N$V&OaznNg6n3AF<0}0*YS9VBEHz#Tk<6sw)fNHCou3B
zAWi(cP6By*o>6f??)rJVj5INUCfe}K5!8*KvovF^uv{<JdyMJBR>JqOgZnrIDhq?M
zQK|yQs$sA}_vYHB2hMueX5ipLpnOJ%VagH9LEs9Z6(S=vGyl{3CD3!FV#z!xgB6jm
z7X6St$vloHKUZ((fmTzSXr~>hV`UC82B+{bR|vLNW_(njK;z_)^oAfT)+nrGTw#uu
zPVAkqbBB$xOnT998HC5ZarJffaux#%!|YBg4s-~NMLnkWg<k47nFOPHjpr6P{N4cC
zw3zE(Rjy*lNxa&V+{kMl1f>?_Tw>?t(<-jchE{#nAx7C)HaK=GUDVccd<O6Eqv}GG
z!7MPQ-OjR1i2je5`ozovmX1Wx-xwh{DGbyFVXfR{t;g-J)F3rmtEb#xylTi8H+vwS
z12=|Cd0z_k{8lPg1t^fl@jg^1KD=iwQVH|J;uzHo<g@K`as+a>4B8t!H{^?v05Na1
z+n~-+<lY1`Of!J%7P5cNmQPOCTIXLSyCnN$gB*nmiK(WF;1btzdbL#FCi0p?^M~Lx
zoIYR+ORwx!gott3=C#u$08JZ?H;1q9>=fRts@>}J9}ckg#3&<-ZYD1~X2SE|u*52K
zJ^|0XND{(+@rrh(04Mlc5M+pXWX+4{9zhruTfFh+O<10|ZIhEy<wJay5iJZMwBI|Y
zIByD-_f0h*uoIPkG$DFJgf&-mqvBBYdoMXV-F-3F_6KZOh^N)7(f`hZtnGs-W3TWv
z_W5Jb3eFR7%<~)R`eQz_5dA0Z)Q=r@{Q^Zc&F7eRwIKdNVD|EvwA@^uGWPtiECr|K
zy&rk;q+{ds`*Tp!ul?fLROVCLgSxRctS!Uw|F(>&m!HMVn>*jelqDyxL%Bf#x>8H4
z_MV1>yd4`t^(b!!#8Mc$9*p}(>sVn$%mM2vf-Efk65KdrC3LFl!Eb$<hpIoMf*R~;
zAFK;fiaD9Gf305Leuf(QjzWr+I!-M@4I52Q2bmgx#7|hTYFHuSzkVvkncA&WVpK>G
zN_(8>jQL0br5<>-YYVR<0S-ylt0yB!;N(d4z1uJB;LuG-68=Tiv0Y%TDujDPTaa$*
z;F*9xm=Svqu{ICmOcD8Qo=|@BtIBe*0Gt%~iyq|WQ30kuMa6QV-X{YIG$ASmf!B~Z
zB-9<98WSY6KIDNYzp?ra%L`Fi0*wDAZ|xf0j5ZXHYtOJnKpsNsCl2*)h*^j3NidK8
z0ZH5z?|1MDYW2@>i~2p9o$dcV1?&`5lL*cb7Z274rxBEh;BAfT3g|b+gLOtH_d=lI
zZi@O6I)*-c*LSx)XqKrV=Y4LD^<aRfh(guONS9Mb<@ASF8#^hX1Hn0cGg%t#-`sI$
zbh7qCSJ0A7)ok71)>DmW*8!IR&O3kfP7ziJLw*?_b}F|Wz{(J$T>^bdSFw<C^@oG&
zmIe(SWb%iY6WYC1)U<S*PA8g%r;**bZ*O|@`vomD*YVA|-1wGVlHIw`wJ%mkRJW`j
zBB)u1I0EFtc>Rz<Vww@cipCv?-Tm(%`t7ISGYlRriy@prHvKa9Irxki&wDM|mkjPb
z3LHoR_s-rQ_EO~JwqNH)?e6sJZ}W6`DI1?AlPzN}2p&WtR(S$8-*E^$EBkQys&A7f
zKt>BPja+(j^TYT?mmLwZ7QXXGRlfX5i{luSSO`um8&SP`26gz#v?v!^#=LZjWyG+g
z&!xoGdh2$$-@-c6y4ksW7~?LWP^n%G7HR@qF8UT(LJNUeUn(v@bo+Se2lRcvI)%Q@
zuj25M<k!|#PJ=KwaSx+JiKRq*S_oXBh&Kcv3cx?*C!&_Yv}U=<#ZdTSLha*+LZDyj
z!)9jMK9uthb$IkmH&Ji+(}ENUxelg@Ut+ueS!APl5HdXs%9Q#IDLbD>Rw_pJ3ul3{
zNY<x#v(Q<wJvED#!yS39JO1s@y89e?h1b5g=jwBP5}Ushr4DPB{Yp;qG%Lf3v14YL
zt(+ZsaJJ`o+llfk7*>%ps5i|fiN?F2w{=<XGig#tnYy``StMrqs}|?%X-o@}i=fY<
zsf_|GHm%|@X*1U`DmM-P<O$GO%`$2Z9&lP&tbXJ=VUcZnh0uOaE*sIefY#pFevKK<
zQSJ#YqdSYnXu~A}$h4=+!FRNTrael5lywMw6gID@d@dop_Gbw#MB5EuwIVTJJ*~rr
zlFzTEIU&%2yYDFf=m_itKsg=Y>`}cf!pcPUQd0TNt7-O6&2GrU`V=svhh>Z^0V%KF
zoXfL5gLI0gVFt_A8*}^O3&SFY5kYG~y1&@`S<*4kY!p~t8*@*H)9Hp-HA=Xt3;}kV
zKeA#Z@!u?jQZ7CUu0WtapwK!&d>XfuP6!oiB&%hA{(R_<io*w%ikkZ^_T#%O>T$(G
z-Jfn>PyGtEFZT0oahliC#mBOhnicZI3Tm@EpA4J&lUkODqbvvN=FwtjRKo~H68pNw
z1E0N`!oFz7X2XwGwm7hAxK^`=ve`AQeB~ebVZ!40e$&u1^QC9&e?^jdkmnz_r6VNx
zu9%nMKSCFZEc?}snRda#;pD0iH3=an1w?}I*8XeQi#=F`W47mad_7YXi?4KVa=9nZ
zZ-7yxuprDJWiKCvrZ{!`H@G)uqm=vY(dqL~IcA?gx}|sKch?)0eUKr`u==Sw{hC#%
zF-A{sk@kC;U5=LTg&8dg&j<tomfP#ns(H){xnV5QOp=SlWL7Z6o2ife^ubzP__mkE
zd<i{b)`OcHz17Pvwpn^2FR=~x{7UoTdNOCtJicnZ9H)ZW#k<Z~-R_qq`rBc|)6tmS
z-vCA4OpGtNYw;qFE7sD3mc1kW`M_6lF~a83>vHRMc+aCT^-x5k&ZndHZOj7c7VOM>
zRV#ik7t{9+tE0q9x;_}JMymh&nSi($4LENe1RTHAS~2WIGNKH^kjX*l`vULn-=*^h
z-)cC(ckE6UlcEO4nh+IQOk@3Urpd*gubk1n?s7|mVT^B~uvD}Bx6di1V9-VjnSxQ+
zh_vg#z9f1d{cP%h&$@W&VU*ponH>UTxeyTrmz+swggcQR-+);RM!#l%urWEQwhq-+
zs#Wa04~`<oS$5VakN{ah{{!SA%yc`rL_zllg|NasjKLn53MAyrI9U2G{6kT$0#KV7
zY=ufm(U!6B<N4M2v64GsjclWJ7HZd!ozWDr#WM<PnR1iWEBAE;|Mvu-xdn6JHJ4wm
zvZb*6yVsK$Q}~Hd`K4S8kNq<-)k{HZUC~$bV~|F5z5UP8R95jdZYph|t4#9~_qz1X
z|9HW8Vu{b4TU52Rj{f{Sr)1{s4`bu;8>w)uyT4_>Thb(<$L<yapA$Oqa*Ex01(@R-
z3vT3A`!Y4>%M=;FIX)^XMw4cT&Z2qG!B2sw5KV>+xoyQOZd0gFC(4{r{L%XmJKyDu
zQlx!`JN=tngNjwt<>eRa-8UGGZA8ts@~M}Ps$S4*8_4q#bIn3sf>YxQ-SdOV75uyW
zFNBq!!mLi0)}!h*{BEhp+cGA2bAzk$WoY);>=ge7y(r7sLh9XuJ(RPNKT)d46uB~|
z%I5c=DW8j#;Tbkm@q*{9o|`Yq_->VNZ;Fo%VtcL^!ik<U?OE=I0ReZrt@xdj$yiZH
zei=lI>ir{c`4EUwqD1+N07YO_`8T-QIY5u9|7*JxU1k1s4nQFGa(){W8XqQCCU9Gr
zt;G1V=LGvH7as0t@|AlPjJ>p%gXQxFXgTJD{@aufB(e^)@@$8gllQ1+-<l2Gl|O}U
ziZ||*pLzd5>?%CG!NEuzg@29hIZZ7NL1tgvg2B>WJa+9bgPUxF$o%6VFJHJD3w@Tx
z7KOBU7mg&odik<GQ-)WQY5db~QigJs@y81l%gVxUAq_{`D?={ZmY*=4NXN=b{@BTH
zm9sNC582AuxIJ5Wk7E)u*R65U28792Q^zS}KTBe?=<o;q_O&u)ksBBAkT^jN)<b(}
zq<@?9kyVKA`4J=&wM@qcOO)gd4T%AEv6H3iCs*7T016z7dSBT)OCJV_!Ng|$xMLdv
z0s;*Cz$7#w;bHfQU9)T^$NxBP!Gb{V=v^?yn_1jPd>f}iz{`xY>Q@jTV+-tes?~MN
zu{Yd~*yg&u|8VP(X%f(f(Hbl^?WEmG*=y<-;zwuvS6KUxt?fW@q5aXWeBn9wzNxQ>
ziyvk(Q@-6s8N5n&kP-~xpm=#dgjDYA_2Ak_sf0R+(BWQvy>@JWgTLC1B8FM<pQhZl
zUds@#j7VFcFe2!u<(&sjSPsUv_|@&bP><9N8|u(#AFQp|LJj0E3T6U0cV_KY7n^%*
zn~*u_6T)InuvjnzmRz!pm%=*^aW&q<yujjb7Kcj6lwN4<HSqD)X}M+z%hv?MH5oym
zaFFV2Mg7UHOCqiZw~ua9EDOQ$;0QLhVW0&So&Vyy0>JZ{WL5y1;63=E`<&ilf0mqJ
zY0i)AHjVq$4!w#(Rk%x60s9^=*vm&TQtHIRY4~R(qa}yDEPRmvq}U)toCD9V#_BI+
zuCa*BoK&ST`<U$AfA^~-vmxEa6)N(G9oz%qriV}`7D{C{RdD-$W98}ThZ55`N(;B~
zrZ)X=YCqU4LaN87H3B<+=Gq6!YXC`wd}cM9b>+NhylZWfPtRI`o6YTNlPT>j@u~A0
zj{RbBswBe;m6st{F$e^lT`Y%sjCNU_gxd^9{~j0~0qPNUrs7DIxyi1OJ?F!F$9k(E
ztD6C${ic1in^bD33*$j38L-=G=W2QmamHf>tA#ma!H93A7MUX`N3<;_^1nX1KdT-C
z>TBOwvW^%9I{uHNv+!%`|J(3jbV_$f2uQa`jIT(SN`sViBi$P!Dbn5D($cUIN_PrK
zNcZR(+q2*E2b{fL=e*BOeeUbN{<v%OQ?=osfj6L@p@bNVjZo2IzAjOz9bIijpJ+AX
zLEgA>X3FkME)~IE8^ch4DmM~6uC>_b-{&>D+rap>z21iJQ$x&Gwj{+hA}>|!`w2ek
zg)bf48qw~}<m;ckUj1tGliTb1iJ>^td2ba2OyBJIc}ICTqh?Bv%g&T@6-CY0N8DC8
z{3HCQ<I2|v&$av-w@<x|L>A`W|M<UWWR9V{E^US<lkUI;w!OrbQ|$=t$6s|7Lxzjm
zxfMn}d!Ozrzl9sj+a<&suZ^&KyqH0kphJl~->g$m4NEH7lwIy{ZM*eKbRAb~bxZP*
z+y3&un(VV_u+Rd~dr~hb3yG*A?JTeb`faEVM6=H>wHz)yckD*C-IkR;o~R>lmD|fa
z_st<r?&C2zvfbd~IS!!%GEsMYeccj`3tzsSKjb)CZJ<qUirx95pB6Y9D!>IFPzAIJ
z-9g+-l|g|b5&vhuWwpImMJ?#4`$+%Z+g&(B!ji039sKF(#(7{~@gbXl(qNO|<4jra
zKEv|V*VyTQlB*dSy_$DKr)rQ{+g^q7+srSL#Y&07WVO99r%(>yRExx<k>)V1Mzf{3
zAsT7W5qZURnJSaeI`w6~kVWh!3f;rs=}zNXt%DAjN7Pv;iT;X-zFpiTK-)$((g0mN
z*v5LZ1Je6wBVCyP|DYGqwwFM6G&+s#Vi?vGeXEVmYBHYoiW}v>K?DLd{lp!U(fA(0
zYUux96Z?Pe2BK>p)~D&W_ohKy2T<&EOEczyLZWwvbGc(Ehc-^+-)8{dc}FOo4m^Nz
zF!t{{gcq$Hfr&eqYkv1UoAYb)$9TGQ2fFsVwADyVF2NJ&TMQy!dyzclqY)aM%Qx|&
zOJ59xh}{GVpg8H9qIZn0)tW!+2u=bujf|lg$Q5_(3+X!L9Q$&A&$-H1H&G=}-^~{v
zlY#GaZpVvU*SaC7MUJk;a5)~rR>HdLCzb+~fXvqwch&zk=$xyZvrf-GYM_X(?YUSs
zR9t%8)Leci!|jYNJQV=+il?_ixOI^(Gkc>bem?-|=Ib$4)QvZoSU*w%j&}FhZ7R<|
zdHno^Lj(vxu9c&BKmBXzp27(C(WHv9ob|GINX>U!bQucQ>1Wb9>ea7xn@(+{2H5hM
z)I$5ilTJ6rp^tFcpywyr?^RfQHmjUH%3Ny>f8yO<BN^)P_<7@V@MbQe15=INnR*~~
z8=@FXHAG8U+YObjHTbLl<5EnDS_Bw8G{3va`qhc^ntHn9_Ix?pD8z8T03>OrFbjtw
zUw;h8g?GE=&0mx_IFEX8KT>2fiMC<hZmRoLaK>uQ0^6!$H5?i8-8mmvRB-E-zA#mI
zf=M8f6!&0a^lxoqd^CtRYZK%(?lPe8{7<|MC%-=(J@ljFFIidFuFGPGAJOmK-%R47
zWjTGRGkLyooBYj|1+Hr^pCmVU?{MT6%azg>;xI>?gR;v;CuQ%Ldm>hZNQWSdFzF1v
zw;;>nXY~MIG;=c(1j)#3Bv$vpGZ;K@%R;PA|NAviV){h(^cnMMkppbrM=V__I{O||
zhd@Pw>jI49+Lw+Nb(9r~;zJ};(lrEj9{dqtbQu*B9mK)q+Hi3qI&1E8UuxH+Ao^@}
zz+oNFY%9vBK{o}A!HCfy37R<3*+~6hZ}2nZ={&ck7sY|fSHCX;cmfu$+K~S3Q#Nh5
z>o7@o(zKK>AgZU!zWnc(cVNPv6arIyYROsocFzBXvNupe06y6N%P^!L{$v$xsOyuz
z`hGcs$O_MfKkEq-eCuRl6vev}1@{Uf&UQyzUP#@w_l+ukzHD$cx^S<FJ50P1pnvkU
z{Pku?bVV$wIs-j16L>h~MpX%yC9JWuIX)%6dIqexj$&F(9u0S7RX^-L9%fBnaR2X2
z8_-<AC^D5OM6(?FmqB}%a|KmOw4vEhU&c4o%n6JiOdjq<x$X4ue+wVR@8lx_>BtKe
zfhYokc~8t229kOv{u#cTyj5jP-!*u1R_F3mcC!CNuBO87L02^w{|W?gvxW%_uaXY5
zt6$h@y_Dbxv$-Scng1957~5Fj`2v;u8A!W)%O6C%Xnl%vv5~$r$ylBDp<@{{0rWrX
z&8h$vJ?}-?t$2IM2vi!gz@%@PJ6te5-bD&s1u=lK2C$&}sv&Pou#1jF56Se9<`IDb
z1E!frx>)%O9Hkv==F0+lVTzsiEq#-3J&oJd3HIY(P#05QJ*|&CMr$2?l^~2^EoPR;
zL~bKN5U0Zi%`hPKPq#C#93)ZN26GTD_)B1Y4fAx~m7A&{dhr7l3R;iGZo-V$2~{>Z
z1Ifjsh66b7jT#$hm>jGNvn44)B>tTqp=Tz1^=*Fgsw9g<W2qX?f~hw1(cfS`tOIk{
zNV8xhWwO;ek6I{mU62FjEi9DGD^~6zzw-LIqo1bwV^5gdW)+@mrHhZ!Lr@Kq-3rx1
zU0*O-7X*>^K#5@3n?gJ2%|3tl%$5WQL2D=M+!f8UW9(-i^qPQfT0z&fy{2~`Hz#Xy
z5mWrWl#EVW-m0Osy>@V78C?XlfQ|J+DPZ}!Lv&>O$T;Qk{O66Ka;v|B_;AYHt0`*)
z#U8j3Y2FKLd)}tPQ{$jnKT0{ltb4Z(IeK#@xS92x=xuIeC9_wR<F<p%;AjzGgLfx?
z*Lh#VgRz*S!Jr|O|Gst|MnfI)>tS-#S9bpV-5D<+VQTAnP%Y-Wzq@AnJoGZNyHBlK
zIQw!zgzKY#i4k*Nu{g2T<mV)-(XNh9Q1Srn%HD>;h}bOab6x3arL($~r+gkm&Aw!H
z)FF8xd#=9jKPZ>>X}@^pXy|)kcEIxDbBBIjPH@u2df3rRR>k<66vryAWdY-VV1-SE
zi@$CvA#C9@bF>9-iU+k`j7%!*oYmTAd-WX}Nx_&-GNmeqCW-ys%n87UF|rgicFi_m
zyH&^{6;kQW_s?rQ`h~mIbcFsrNDy&vys02>F|IV_(|jPvUq}RN>_HYbri*)<mueOH
z!5RnxI59q^+^gLRj=C?VC^|S&|AUbb{qeyui|8EbF-L3CofAxTPA}t>eBy~<F;i!}
zZPjy?zB7vI>vA=Jtgp!exE`R-p2VX=Ov5Oxwl+Fm;up(z@R0X>oy6KW$9t>tP{CgP
zo2ZV{X7bE28Ih7pJJ;N@*CS-H2n9`}QiD;k{!<RQf;=qUr9kPp925@{MIdKzU&X&#
z>y&0cGIG`@4XOvWaU6lo=Zibcvi}U)`4}rlqtuhuA}>tKT<rZsnwxCpeZ*BA-=g%+
zQ9XT;@C{KHTKf@z^GF261z39lRi>dVwq?o0G}vO}H-Pfjn?wt~E=`YooCzho{M5@y
z()Piuw~vif5c&>^HX>gO&GKUBS5Lkh|1;fiRxwGVihtDFmSC=IU0h-Z)z0~j8v9*b
z<b|`3v(le~^zu&$V0FDk%QQFvU`NNp(NGmpFQu#X)?4M4mii|3Yyw3I@3mg;D5Ec(
zGpIm<G<uO`2mG>a7-JQ^Oc9jWVAIpTj`z*v8})a|#4__~>$#ur?~N}+&E<qT%Jg38
zNRI3O0tk$R5<l2EfAm_PSmWE4J^9^s!~rrWf#&zdH1h{7m1Pl5)D&)t{a_?+BLu<L
z`xHjKca0yGgj4AgKK$p%OI<)Bi8ewp++o%Zuk(%AEhw45GY{U8VcxYT2`*yGlR;a&
z@#F*E#pW()E=_%z+!Tb{O|O%mMvyFANs<8iu`AGzU)0E=TkYYaK15pw>y9qjKn1TS
zvs@&opVWPD<_8)3Tx?>n__eb#rH?_ws2zb>wuF!9a*T<`msZ43JCQ4_72fIrDvGS*
z&hON1y<YJF7^CfCg7EJPf}Ce;Q3MZ8%k4dt?TpPEqIXR}i@z@Le@r(oP)0BxZb?4}
zwW2LbqCvI0;b}}wG=g;|1{*D-lQJ?8mXr?DDfUl;AO0>Ec{0uXOx}0vGrc235X-!~
z5M{|`G1ej2nbR_42-y_ptdrB(eUZF0S3+p1fvG@ZiiAR-;h<g@48M(#jPpK{kfraU
z>RJD9wF!+}K#-Y+6I;5gOT4meiQPvrBXO8t`Z(C13o~q7nEXTKTuENo@n_Sn3mJ$R
zJZoUEOXEPIREO7XE~P!PAq&r*1}~m#nTCWwps>9|CWj25-VjA@!?Z$a1OGQTHi8wA
zCHcfNVn@3xfw18KMWet@cRDHQ-IiRnOozMpcL&3Ja1$K0HyXTzUIi4V9)pvheqVQ;
z#0iU<sl-a0pMoqRbr?<VdOdqV_XR)tPOH(PkQOyQqm0Szo^`T^wlxVauyNS`1R&O>
z5%JnTzMUhZ5WtSU_`C=kCl3Zkl=m-^Z!emG40#3>3PL|KOTNrRA*qfSGaFnQH*q7F
zGzB|<NLeU6>}_UUOo%E8wGa(MN(8%|u=Q+HQSGI3g;453@G^@#m8;&}`WKtF%G=42
zkH00RpY_jK`<hB6m0g1N^Ix)1F_^E-QAZE-kv;XIjvNC=Gyw`Rhe^#V#s{0J#-wgl
zXSJ4pVnV0QXSLTtI}gvFs~9%CnT;iqKTeMgR#kV`86Y{({xK=J)7knq=-8Zy)YY~K
zA!(udBk=vJ372?#1K>gVHPS=?6pfsFXm|Es3B}je1hf@sWmc<^llI3?ZpL}nijSR7
z4)o1&1nHR@m!cxJCxPV&MOhrz$X6&DjtqmbC*lJ~m5^meQFX<v`vlbJ49bQRq(Fc$
zaBS=@rA|ylmO;A(7J2QNmuUBl8t&5qoQ<gv?L0?`$U-;OG>s2JAOR3k_1nvU-@Dyo
zvD1__6@v|G;El;<YEbKmyMyOI7pGb+4vOMQ@`txLrk!L#X3NsY#Kv)w+;56MCw>@~
za<%@oQZ$OJ;~D-_!?!_Ix_0=YLEQ5b1;+E2vltfX18GtWZtpU%8SF)E0tRAcuoMnD
zD<>l9JViTpFscmG5t`!jkhzDn*QCZa;A!G_LmQfzF@>1?{obWW#}{H56b1$YCoyt9
zYZzB?Y<n&K-(6)7)qa}`>oH*(9?F`(-)ysCwy^vyhW&;2rXQe*0n?MN9V=YP-6*5K
zuETl4pLHD#u(mnc36F^A7;qb!W=0bv=Hw=QdEZAWD%BrbGGqe|Ii-*~J4?S5dL!s}
z+<3HQ0#H<!%pTGQ<egxGv{<A*9ORN?n4~svyC?CpCjt0buNHm2`G5cIx!zsWEq?O3
ztLVk_pA8iM@MKq^-H)IvBkdK&^lf=@G~+V*=#XY3N1<bU_yKx?!%Z~g-Rmg4D;vQj
z%>L9)jmV!}`$qLs)o+OEzmNS3XIYn^cJD>b8E*$I#LF?}i3~)mDbngpht{Iiax*%-
z)ki_`%kzh|_s5Z&cIzpUDSI7^QDw7eIelb!t~a%OmLs=14s}ypeyh#GYN$&pDY$AB
z7fN<vUOgOSp0^zmj!8)!HBJao1~ivY54N<Uyuao25s6u12#gwFXOKf80Jb-Mt0X6f
zPEFHAAsHQoi(mR$y7&M<g~kC$H5I^gVWO@j+H8BO4~JMc#;n1`tY6tI?QBahmN)MM
z4Gvd`T_*bB*V&jB)@wg79mS+s3?8}n9sX2TGuBtcsKSLbIEc?R(fk30t_DL|NN#h&
zY6LW(Y)8}0ns>;jaB}=_m<5TLf+m02kCn9TO0PLFgQYAvk%J2>mBOx_3}n;Wu|D)9
zxf@1?)DJWgZQYGXacm6r7kmR~5!PRX-ahZc3-BnlK<vKx)IU3$ygi!-lMA5wMN`qW
zTm0vq`Yq9b<-^VriNZpJpc<I{epbkA;keWIHJ#@$QyVtgXVQc)vOFUVObAL)7uK3P
zx6^M>w<bQ^c_(9_Fw&TN{Aw8?VIG$&^X27;>qeJm5_8s18@df1Po_TN8vOWjN&!-J
zQ}I7n?s$PdQ;P-8IP77cf3l&{w;^JqPH~l{>NZo@;)<G`NOKBx6aCNEllXzJdBfU?
zu9f`ezLCSY&D&_hh2}=flbMUUCq>hI@V-~lafH)y0gQ-9x$pIOAFAQ<!d8^p&QX!P
zJZr8=f+-lBBFeQ&wZBC@TFX8#c^O}6q(`oMQE5e7Srcs2IA>fWAk`OMzEAlZNkC{K
zH&LhMrcWTnex+&Pdd9a+u!!}F6Wfd&;NQ`PZtUxwHp2L=K?_lAH=osiJv2yU58H4?
zxF@gWG~4O_EO+mEn)rw#g8?Jg=vAe?lA1+!c)uGYf6G1hK^gsLFv58A%Pm@7CW(Fs
zr{s~rJ3w^dl-?m5PA_VL4Zg>N>=HdQ*%7u(k-h7XlIK0`9bfX()CiHmyLo$<W2o;o
zmEUOf#>oo!>~E|!0p^>KNVL$RTv2iar`dK|zNFt)?8ssWg}qZ(W&=P_JlM+9_kA0<
zPzpQkN`Xr4`ij5xv1Q-O!i2xa`P9BDnH%q5S?lQGl{)#gf2yTlsr;l%-I}zYMX9<I
zD_y6jve#narsjGOQ5!mJuDKARdguRDYD(BI;v!>w<Kx}bw1H$wKDM+=(A*!l3Fm>g
znPr?rF7JwYT;Ovi+{-Z5(ib-qDj3V>t+THk#jRZ|l*9&vW}eSVIIz1NwB2J9vK7I}
zyvL&0cbgZ%WX75kpD;uM6=sj)_ShWN%Hq-WlmNV`eZ#Tiebe--uf3xy1e@r%lNa`|
zaQ-JHX0K{RH>K)7d_ei<$i0^=I}5p@tJ?<2^87FF^~s*5wgFpo{?9!BL%lJ=BHv-^
zcL#qo<(cQP7u(lNkG$1ovm<4j9k&!JOHd$-Q=i2FkdzRqmOTFMqst!ri@u@hWt$*Y
z1<TLFkd?1YTmPG6i8!ycbequ3S-I&}5c`;<HfD_aa8>6gAS`S@EQ1UgjToHcG0nqs
zq<x7qJdX3!GR0C!@UYQwT@=nG=8~Kh!F*k$hY|uz{P<RCw`U}NV!!s^MS<WjkTjf>
ztblPqW7uL+zDZT`s?|0T(^iZ?mJQp^8^cFBqrrl0+d*_Nzn1yEN=S0+hvus;9$tDy
zY|!1ZAg-|bFpl`EY;qK?kE;o*!Fmi?24XB6spTM>)#687%%<^)(zA%eAgs%Q<n8PS
z*`8hTN`=BGJ6d6J`or|^UcvU!(v;^1-uo5h-E>aNLi--1&L1<#0{aXOCFVRuYzBiD
zOIyRS1;sILJ~@w9c`h_c-1i!eubqogO0S8P44f(B6uA8s^5{%lc8sVHn`e`<d*8K)
zVTJLDZUwVINrBGEw+Uy5(b}&+Oaz~pl5CteWFnR@B@o}^z*C*skbK4U?Y9#TL~g2Y
zwVxxKj!_5mQ>B?85q1H3S!4xs_mi&CU%4_pX8E)~xha3h*k&OiCY>5q>+EbCX$7_k
zx;lKaRET^D(hgT*n2NW;Gf$pA_jDx%h=f!9++6Nv^xIFevcYPlF#$zjC4jCV*im1g
zViB8^{ozPS!?HnL^VD-z7bJ;m`x@Xi!m(J$uw{c)C|hku6j<H=+%da~I$|aKhkd}H
zZa9i+P<)Ce@EvCq_cW+fxecgS`SyA|Lj2@McMDq<&urh_vCwds$k*mDl(?nR<OMc{
zU$CM)ncAY4r*wTO8crrr<khl$>(5Ptm;}bIZjo#<EwLY3u}#@q2WZ*5nZ`~KY2)lZ
z`*|GBX-K#)cpOc|ApeJNzW%ZMZ2AaiZP;kGEXK0t+1cdyQ(fjj929z5jz34#L2wX2
zd&>Hffug^_z9NDDhr@W=l;qXtM8?cq4WEJbzi9J7D!fF4gEOK{%EjMAd(moHvR=^9
zq$Ol-dsb{heB7pYOvPRJuU@vHYlY#ens9-gW*%pn*f_Z|)?r8u$VS)Bei{G2*<Bv1
zWA5Llj1K|VBB+<2j#$VY8QUwKqq3<xU&Z2tXGR<K6ZOA?8`I0Ih)MOwzIV-9<z5*x
z9+$(RgWL_R_U&o7xk~RY5J9>Cq33K1ZN(CEt5kMfj*JGyd?eyw<%P++S=(iuw7^R|
z#iHtFGL0$*S>QL{Fr*NmeV>5rSe%bjmp9N{Yp%_*LV?{cIo4`l<b?##MCI!m`-2mX
zW%$k*Pg6mCQqH1pAaSy2xe$?xZ4`Zn%(gLbcBQ9aP<ClZE>MatdG<q&Wc?iY@0cAY
z&2}O)Z4I%;DyA@l4Sr=d;5+x(OPQ+;K~d6y^NOSlLDO(m)qd6n%V`zi5Yn5S`w~ek
zeBk4lv3>~Fi*91sx#yu5_YB30af|fyALN^h1qBKF_o?G4aBrScKj`Wsv8~DWBI2_u
z?>Qw4viFnqK(EC73-Zzp!OjkqaHy-1Fw6P|2L4SbCv12u>?YR1I!*+rgu8Xvi2cgW
z*50I46{%-n)g1D}V^?(F*<O+De)gS!v-SMpZq7Bo_C~(3>UJ7{{Yz0tnxn-CgKG5H
z$b}w;pfeSnNr@q9Dy|Ri?gcciha^v>b-X_e`3K*V>CZB%!X*rkxp*V?ornF5oha(j
z%Cdt{A!Yi)pvK^@9IZm<7@$aE0;=NXzn2v$xHg}c8~N$<YT;y$;Bt>3%!j`HHP-88
z?;Il%iKLKIZc;JU;<NR=HOs$2U-~%s(q4W%W>h${zm9&^=nD8Z40E^t_HpY}+vN;;
z$<AW(T)QTY2ygc|QcjE*a%45h6ygmpwmVg@y;XkwJl2^@i;gFDKCZE0ibSDMzS~|T
z8xNL<?fJUj{zh>q0ekI&xr_L_I*Q&ub82CeJnd`moy^o5iP+8P<{oc95*8LJnpnby
z%Q}e&T4004vx{d=yZcOk<KL54nFOUNyw|1d<0#sVi{LbTXjv-$g$LZ-#Zkw=I&T->
z*$(wf&0rz~SxNQberw%L<#SV<U<&&y$}|jieKB(0UT>$!<Yy7t^mQnOx}V?ET9+O5
zj7Vz>(jmlsjC7apkA3Z(TJOry^%uS9cD<j9iaHm67H}7}+9Ntt>##$9nXRvJPO|f%
z-hkn<6GsUwgSQ9fvz?r4sTXJ1&pprXYWxb^(zib1l^6m!a`b)gdEu1nxT|3wy`|J|
zT~Pj6EVo`On1a=}>CMBNtAI}$TwWDGJNFl&l;P>C0|l084R!vyc2g`-NQ!<|)Rh_^
z4S~$@IN`zQ_>luQqw_w_$l*zbbk3Mh6`39o7#xqo_U^zjZ<g-rVd02YeXc(B_}g8J
zut*16WO%vgx#){$VjBO#1TnTu|B6KYdL^2BL#c0EpC<mZB(lupA(-C`JO=Uanw7-y
z^;Q?87Japc1Dh2-*AAES_fq#-?(EnLaCvz2F(UBbA7Eu(NF9rmThL_c<M2HWig%p?
zzaU`h5H)!-mU-yNVfb&ETXNIVV>^?Z#`a6h_T?2*jMJ_!@fdjrm~lT_&VOi(A{8O)
z|2{>p%{7UJZLyq@#sWb)Yw#HnDDl2>u{F|rlrFJ;p#8r(K{LIOe@}4|r{tDxQjMFY
z-9Z*DAr@KY(X3%Mc2f))snyxUu6Qz!Y1kKhH$xbQ^rEuT|LDYTGXj(Pw1%_|K1{XF
z$CbAo>84-P_cTt=hmJ)~H859%I@F}!1^D26mek+UZRt$<fnRMZx+3)qlK&f%@u)e=
z;ZpX|xFIy)GdW;T|4v~aN0uuhwa}^dO5Ey6mD~N<tgw5viDsE#uQRvE2(hOJ2QjXD
z#i1W%8$0H3X@Ff4@~;?4d3D1Frf6^X71{YRP0ukK&xmE3%jt=lC}sn>mg2f<e|>gj
z{t#}4Fg70V_jbiX!$J2X$~)az{?6%4JEt`YLeX(i=~?-OLf|I&G+kjMqJj0xS=>1L
z5xf7Z#ukRwnf6fCwl_?Pt7nd{HlzP|-7fbNzj+9%@V90<);Oi`WPkamtk3Y(Q~=Z0
zKl@8wqY($@5To&oB<pP<k|p~6wy-!JXUn_}gIIdw<8HL+@%~`s9Zr6uFxA8_lmZnn
z;wnGQoYu0`;KeLy&XP*A>jxw-_|0FDx)AKV`C~|LB9a;Z3maY~!#^4|IMLYSE0X*8
zd)Ef6=X;7$Jg0robk~DarUJETPRe#SoGz~5BODYO4`>piq@S!QsV??+=h}os5KN?r
ziZfq?^X>Y;tf0d62PsS@-8~tb_)~fM;6Yjm-3=W=OoG5pD|dN2Za6%Q<a#f>@i*4&
zQNSm2AV%x#fl@F<!Rkd1mGq&u%nuEAnV(y07me*v_H~xRR)a!;_7tSoQBu+)l%8Ob
zUUdV+iduQ`wK$XTaQzv_4>3%!t48=nz(KMafr}L3Z%1>qU!*0XkP7`aaf5j?>T0@<
zRnHaXn<KMN5sZkwhFS<%eTa_=(!OdmgQMhceaC}qKa!2-*r!dX4w-WgbYZ)O|5v^T
zPsSux-FK?jOxkO)xp%@8(vI-GzI+rO<s6<AU^G#Xi!d}BQ|)Wqn|1EQY*vxH{l58M
zkZIc008SQ3KvsOKosM;1@0bQnaZgB1@!yFr;KzVG$v!`gE!UmQgVA@)aoG0EeghNT
zS*ZdK^d(Ki$_cmPA5Y%`nBSY2Qf0}9eq8%Uax8@+O~uL(0k=A&vEsa^l9+V;3JR<*
zo?R<ClNJQ3zWZF|Fr<Kx5{7e+;m6C!^Zlne(sFV~=QlUc@UNUHj(*qW%(3811$n#K
zwu+m`z!o7JLo_i3SR@#K@pnG46oWpPoz>aSxLb4Sn?pfiLirB2P!)CIm}z#|s}CD_
zGGDBC$%tZIorx#BdWLLYn+eerFlEeK@fRo1NdYxQRy?rfrnycCtJ%CIX&;|5UnHJ%
z08UmQLj;6<Ww?;z1HbOD6d<wunk49jX!~^!<g1zV1x)D*w?I08B=+mZdR*L~j0a$8
zh8-s&)o_^K`qHrYUHk`<CzXB-Qoh8mdi@@}i6kX}$6A1D$c+u~p~a#!kIUhxBMz5w
z23o$UZ+8*K@Xy=}p#%@92+fI%MjcE_zvlcTwA)BJa?vX2&oF{&h@d!y8TsRj9JN=i
zJo*M@*#o8ucNkZn2+gkef*21c<p-&1(|073IKpM|G&$C40Q<FnzDD^LeVP~5<#m2S
zbK*&naR=J20+uSDwfg~)M>H2rxJ;KF#Ru(`5kL3;TNkxOCq@<|7i$!}MotAw1f_+?
z>VTb}RFxq+4rgtB0nRWSMl7UrCS0+E1;c^QQhRHE9o?M9g==f^i6)iSjf&Wx0bNN5
zEvF2yA<-xR>qkrl?=Mar0M34vJ$cwnC)W=9n$F^0VM1EG){Zk1Pq8@OYl7Md+=A#W
zC%qoV{KSPw=`*3hhGBZSf<zx6#FlC$+{F%6g8lw%SEk{)fP%O4Cd8_8EmvA*W)Gs{
zZ$gNVFyu{hms-r^B&1^_ZjVde=hayf-{c}&UyHO_iMQKpa$9$kc)vRE!F!ifDL!nm
z4Slc$Iu%{XuR{+A>Wj}viH1uRG^>B#h!I@ol9V&`fK4fi`Xi%Kt20Sx=yx)rPC78_
z)L&?0vo|EjnhXIAXM4X4PN4Ms>|TEq?apktS$($P+lyk)P46mYFLv5}i=DBfCinFx
z!z&azk-(SF!e^gp5>oTIYvnm=Vn*Sz*hybC>VuvKY$2HbfVrm6$xpBqR!-(eP{Vtr
zS?s0$&`EH1k`g!ej#cLJqrFkqi{+o}(D-Ieh)1<%#@toot(+D@g{|8TTdS@{a%8PP
z>&kZ<;tF%}!|kNTmHe^Z;|k~^cA`S4UNpThWR4A`*##VDH9|<Z3{y1;Oiq-V1S$Vg
zAYIyl%~tA%_$$UzUxrI+oT!jox&eZSbKg&ERs!S;R}ErQQ75fF<kqaGFcBK2+;M7i
zb1aDSx6nmmVYcrEhTC_{W_`SN@`X0RxUIylY?<_xw^wDY@DYqx!&fFtUq_3$B}tp>
zGB)rQHmGXiw3Oy`)?p@w5(ky3#P3FKe)JRiksM*b!ZS;=eKIU-ddfZH!X+3AVal%s
zZyH_fidKWBb>Qt7bsOXR)>gSonUr_7L@M0U=W%P*R6xW1xoy2Rfg0!7d!+2I*Jm54
zv-v{7^CU@5-;fm%^V>a~I+qjo>#aKUwMGrnX|fM5{~=4{P0PVlW)69q&EBixH374!
zk7-13OKuCmT~VruNB>mlFS$}UyRd}nj>&g_XOrDZSMSKL%nYyphH!wS;q+#z;G+9s
z22b`Nm9aGvQW>5}^>G8IrFSJmm~$H71+Ulssz~#d*y(eT4_-oyBP$uKAUCg=g_Qf1
zjO+bTQ!~j5X8vLRY&`L)o|kSBn!MM>Wg%}-FS@>%EXT^Y4JL?Po!g0ASHoN0fK{%E
zPsrmO-EE;CF!p7;!f&6WdQsIoMz&!dn$W4unL5j#Z@`#v@$sY^eUh^%wRb2p#F_pb
zv??)=f6JURPNsGU%yf@8EAfxFD$To;LtF19+9uJ@Zb(YX{n=sSMUmL(xlXMx;%tLQ
z<_j16H>obSSfZ_w%+(x~erEtNw66{>qm`;bbz?;Q)!A@TRWt6fAqZ6t=ar#nCU&_R
z#JR6*$b-M(aPB9(<3(V)A>_ZmSxKbk3{C`Izd8i0^;!852?8M=R4U~st3wgj1vk3B
zOOW-=I`}2rKA;n<1jH#T5!j3=p7Wo%MB6INq@UU)N+ypUgLQOWhwE<P(r8?w52%Lz
z{`%BVTt*JMwZzS%e*!)(PQNWP0$ONuaa@7%p|HCD=`pe@C^+7(+vzMwhJ%jKTjFW|
zdsl{)*hH^FE{e&PY?7W=+fVVg%kunr_+6`Yr0}Oub!*qPUaLENGb{+lgH@k)XHNJK
zUz99F&kwEm@Z1BJ9Z)yzd^T>!M#-L|l+gioQ!pX13O+8!Ok84$>TtgSoHh?;0!k>Y
z#~__;DV4qeybmx!Gbf^5L{HlCQvArt6k>OUsX(H$O?(6ubAQBaynEn6%l6>H(Hp^a
z2o0PNK~(qhVm%3OlJ-)F=adH#gazt7`z+#B1pUE(x&@RiD*Qy6*RNd!7D0X`Xwgh~
zxR}jHpIw~@nk>9#YusT)6Rj6#TvG7(jXJo=>vy&)js~LGw%Dwp%<s#z%si5MXi`PR
z1GDhNtWLqU#Js~dJ<lFw>T-Bxg@|Gc`GUhQWp^AFH>41l(B3Mb%FBiE!(Hyc;`Fn_
z-794J*45(AOvz8bA=8H4t^d|$N7@gkMa4_mVs9P2u{ZF8ldfu{o~AiO1%dxe1hl*`
z2MF)zLcZE<T}hVsHrzs;Hm@3j_PF4@2)PMxGqlmBlc=F={#}7N8vXbIhYAp7HGJ+!
zeDf;#8cBydT&7qzPe+WW?bUgWKXiq1!3kx4Fjmqa@~3n1_RA)S01KOR2oAJKeF0>v
z+kE^4B+ICojEDbEOp+|V;HHJcM_uRSov!DH1L~(R^aLiEkv}Tn>JzXSZU!>t^=W%p
z1W+M=I)4jt{sIu;)HcsT=-!*prfolat`VZeiHb+_L}r^{bHmw|=+~J$BoiDUXzwJv
z?E)=I@RZONXoR)Tyd4gB5=%shcY`MkpJmns|2l=j;?dyGi(C00K03Sdib!kLqeqeu
z3e<Ysk(<H;%huP2092xga{Na6X9UrM-H8n8dWgrTky{Fx>#dE1Z+~VqlAoDz;0?Ys
zm5pTCwDZf|2HOKXEbYg1Vyr(Z?<bc?77i-?3Rd4#>~{yEfsQ&ewDv(*CcL#axG*ep
zGUHvH?W71uaFa)djt0_m9pJxP$+b{kzbEO3Il0BhTMl(o-D=@<DR#S*m5DUpL0<7*
zDbG#6t6R>*EN3r`Gv^8NKRC^Yf|mgtH`0vU7r`7yt-^`Op&SeE=APxZGY_&6>n~4D
z;4IOW&ao%es(z#`XdcT9-#qB4KSE;6>3d5<>qRh2=Odl^ykMa0nvc)R6T*k}*imic
z__AiMCiwFy6eofJ)PI)ug#g<zYl~L!Ozn~42{aNsI(To;%HKWBPhEB?-j#rtwmJy|
z@VUd6?%yV&JfmUiAT6R^J&tY$k9#ZM`JSM&{Egwa$lIEI3dEVoad#P9%HO>E{A`kM
zk9w^?lhs0DvmXs{6hV7Ym5m3q+@6g;mIV3N(ABE{fh{a-ZAugZ?*-ws`dt4=(_9#b
z9*9!zg9LG%F`t4D%x8)0#a5L6Zu)NSJ@}*3)A_q|tsT?6{_ENk$*3X!PfzOaZ;jJ|
zdP%7F_2f+8@I>aSOPsuZiaVw?|BEsYu&%qpwQp*(#`2?dSM<hDHehh_L0-UY9HoCZ
z#15{3{k6ygjAxx9phfknq~x#X9g`WX3|E^9&HBJ|t4y7hReQK**22yoJ$cuOoIPPJ
z%Xvu{M7Nc+V6d1V>+Z=nu$#3kIPB-=tKT<YwjJZ-Qg2Grp%5-OL)R6SbvWvM$QYly
z%5|wPX8l_f7NG7bR&iO#sBYaxc3mSSKTnpoubjdt-q#q)RfPI@fBy7xP{gA0>N1;X
z|M4?`!LVzp)n_MI0lMAUL#OOE(%tgj?eF~a`e>vboYG9MMmro8{1mg~ue^V0xOutI
z1|-Lk;Wf!;x5>*3NkFx860tkVrXZjR2q-%L^IYb=@V+Yc?MAnhh4$bM?xg|yrdN?`
ztn7b_+DWJ?q^(2&8@bOQX1r!MBC)Hr|6Fcs^#NXt+fhy(?)!jLrr23u1bHkBI!>;N
z?A$ZyPxfoOnRp>o*UsExy3I`?`P0$M6f~rRU3=^3^|nQzl(Rg=Lp`y9n2;aq<*k1B
zjKT|3pdYFDyx79VOO0xH0vjY9zC0;X`|%O3_G_YgWS|UnTB+70;VL&B14RdpPDfv7
zA?Ez<$k(*D<=(-{Ca|PtLebwVSp)kL(rKo^m}doLjmJfO_o0o3)i2FSU-W<p^Avz_
zn=z&D<86RueU6+y6*cB%*qvob8l%fO<fzu)e;2(tB_(4_(=FA)%N3+HD^ZowA)YjA
zM`5>m3C=I0RP+`W;kIl6C1>mJFzje*|3qV3MyypJOf-T9$hyKKX0E@Dlq*$l^ZJpT
zy`Y~W=~`6UG1c=dz6~K=8+pv&Lk($;Z_uulUK%&wbvl{_;AZcggNDHbOTO3mnm>m*
ziB!ksDDFTB$QMZSpZJfe>}yfT|Ai$t!30M7%|A-5$hH+9I&tet(02@x(z3(Zh@Wm6
zY5CqRdMo;}f}GxgRco?Rq_Tv?P?AA+(|dT@@|;;TQns`7CCIG=mHh?Aqo<ujdPlx6
z*c~~5@(w3e_IYhli@BY(4Q@nmAXFPr_%#P#A#>7+1)EY`9S{^Q-1uyL0k&s+&w4k<
zC+T59zh<|d>aaAMsos=SPJE<8e$;qK<Hec7<JeXbOgD|FCkgDIddd4zve?6?_S3KC
z6Y#4Rv!~Oqw={skmo5>2c^j-`rZw(+SF2l6-7Ho7eE*+KZ;KHfcsUxNzJjY>!oGg{
z|CSlOL2<&Q_yNYlR6FDZCqXOk_Qu;0la?z1oPzlMT3@-2K0mu1xGndoSyams7>2MM
zF1GmQ$6cNgt)a_<P5zTJrqQ|7_`LQc8>;Xj-R7rG?mp$&_xHCuQdcY$9d~BpKXns8
zl=#J`)91=LFD2`Fv7JY#Rt&2DA%7v(lQ3%?40h#r`u~1An0$+pXF+K39pfjh2VRc6
zMb=e*^!_vPN?M2DAehHf_r-J3dsV4qn^Jb^FA(2)&pd>kV>y2OS=ByB?^(_=>VJsX
zIfyeTO)tSkVKK`tPEG~<T>Aw*!9)tKq0Xg_M2TM#qKo>8_mic;=i@*(GkPeL^ytD*
zz(LHY_ML@JpqoHazUJ_xw98@4ZaZb<nK=aXA2`N1&09q+De7bf+2Bl0mYI54O#i;(
zi~B3#3nnPE)CXdOZ($G^AFnI$5-mrt4VyojDOAC|5@-XN>Qn*8?Js^+ww&g<Z|+=8
z{!ZkT;N1jX4N{d3Mc90pbgn*9ywzs7EDC=f-ZV%9mC)H+h|Tp+tt32zv)@NDEs2$S
z^;K#z-oeC6m161WV4^Ek_m&UYFKs3c$0|jG5s4>_ZTmdWyJjyOifJz~$hleC=9|6V
zHiq8+aue{{oz2PBrjVUuDF7iYe@|3YjYcp#l+WY#KwTtTIY7$BUr60+>s@1{T!>DM
zUY9-!Urnlg-F3b_N<8a9u|75VyiqKfB@yhI-LX@8f)P_$k~W91ltO#^%3S8<vS>SX
z*VbERe;s;2w8cS({a3A57Z6A-8$AA3ej9kiqn(T5TCy71LW^v%#ok=XHhOBm<Y1j$
zyR_dsh<o5^x^!#VTNvqn`!w^!a*oVW;|U)99!&p4jDD{~0-mPjUvaw}cUe4*TuJif
z=Yq*m4P{YdIB2RLWfFkmx`ev^*!cUwe^2L*<qTzww3A@qf31dgk?@?b93n%{;&Hm-
z%}~M*3_b%VT(jm)pLM%IWl-Ykvh;P-=Zo2TpOT5lGnPQG4)#Hw%@8OQne|fyL54sB
zxj0oqLCX)y;@zmqhoiepBkAqP{Z^>&Q#T&8(K5@NFz3I-)Ped8MBF=fwUry*+w`+&
z@Yu*!QL*zQS^8=GldPgcKl0atFgtGq%B<DZ?&uY5IrQR`D8{k#ZJjP=jKN10@GM}>
zhcJlXN%JV3F(WQrY61%x?<RbZPI=6}UuRN>&LHH+7eOtrZYNWrg!e3(Ywx9=#2&JD
zLT1;n9t2ZR5iSknD3A(>)jq~{v}b(sj{Ck)hFKEjcOMV_ZdR8?%s#=jkK(w;KjUb!
zv`gM+!R-`zO0_*wMch7~Y(B^AS-hmUGjEN{>}A;n{~cND*4_a}y_*{YeL>xvi}yp3
zT$|c1Z^6Gwcb>_zbhmoY(e*gsg|`hBhbsM%aubZ0n>--88N*>@8uV)XrW&Ow&GR2E
z5U8cp|5ffSSiO9sjB>d0Uo{~#e|l!pVOB$jXEth5E(YmeYPekG&%*N-!Tn%0JJIYB
zRIc_u_FmO0bEHgblHj(6cQ=?1WyhzUZWM*Wzi298Jdqy`osI8;Iw<eAjNa!q#g~J%
zveVwhID>U9hL0m^9*`Q@5w`OeG$*f>`!6=auNDVT*z?ZX9B=ZKP&GgK@`92O8V<9j
zFUfz44Q##lzeFgDl?3tsnTq?shdwM;mYSWxyCvXh;oF8QN&a8c&EKmc6=+d-_nG=J
z^|m&B@jIyI#vqrYR~kns%LgaNo4jfA8-7J^xu<})qx*H$zdf&1@0x^0L?rK;vc5}*
zPaPj)vKxolv|$=pV%sXhqGLX0yT3X|UKya7j9M6#A!UHqykL7J+ibg!7Xez5QiIpu
z@nOT1SnTLy4x7mrUUzgIJ*M^H)#{u^Ra1Nk$a~o9YFCq3j2BHCV5g&nB~5Hc(J>kf
z{cN-MS9bMS2H#QQ=WaUQ3pzvGSFuHE=r(d}1KsC`>EBj66&OBf_<5?mE4wvMBd#+_
z1be>vw5xDs9(v@?wEGKcdTnwFJHSTdhgZA*YSAD?f=iwTlNg5^vGJJg{!6~paMz0Y
zcwy!XRywnlv(Rs%Pc-{J<h+Y(xBHikn-W8lBgvC|Y?yB>{y{i?2<sgaW_xRb3aJ4a
zG@PGX;_)BK{AdoFWUR;HYf$Id;3+qyz7KwQ5{$V+e919p`Ve}Ydik~IE6WO$I=+gr
zUVY2&pHnIldj`ZSUgcQ`_G?0PJek-PzZ*IyQO-&9-P{~32NM`#>Eo?YKk~(u4qFhR
zF4i^8SV5$GYUv|9-v*TBQBH6xgj>CmhwgYrXC6u?W!2$9NO(obr?!-Z`6(t}!V|Ga
zx!Iq6Kw1e+tB7DYZg;-``!PM8iYg)|>tTBC=Z`rFK@=hMuq5XYpfEC<pP*LrguUL^
zwDNc1o&Y?-Q=ts<h3bQ3JbRH^Htb2F9%mKzv|*gY3pM%2F{f#I*Z7bHcw0K|4!ej5
z1VNsnU5rmfiB~b1@$s>zTYvcr1X8SE5f)(cVcbFjVf;P;*GuIm0UVnVSXx2;n0%O|
zcSuFd{iO1mH4JoFfb>c*n)O~q)A={!ZTgo9Ix&Z|FCj1LiDWRDfvur*055Yb2)~ah
zvt|jD_30M!2;-0Ax3AY6XQJYPY#q(`KNlig#W@AuhDyt)^V2II&!|HFp1X!Y5a<^i
z!6tR5fu|LmW%<v*kXWQNjGjRp({=O3XiKEz6ln0t+$y_DJgdDfs@6$V{NeA&89-e=
zo4Bu5S;%s5=I50l1}S<EUJFg)IS-JGk-*7bb=B%RTs-3W!L;@g__b;^MT^vhFhEQe
zXu4t5CcdKj^45P0wCnK#Ob=so$2BY`l~P9LV&;e5%HtjXm5262*+b%RSd)2-hKX~F
zTr7ZoDsXKbjN5znWX<|};;VJwDdv&b2HnB>7V<3QJy^~*yNV7%(UK|s5J(}K$%Nj3
zXmw||lYU1^xmi4PQZnVi%7V-fPiAwq7UvUUs7Zw612HSEoHim(z#Z3*q-$g2m3m<t
zg+UXM8febC0>EZL<%iP+V8*PLkNhJ^)5q6zU~MREyC;VsL@I+Bf|Yl%lkgZz5=xQ}
z%25Fmwh_w!%`i!&?y3rKRkNdNe6;@hxY79mWH9mozJIFL*)R2%uHZMXAM8ROVZJcW
z4Y~RO2^~pRS1reh$HjA$036c~f7K2v!98o4zLg!f<;Bcd&7}GUoqPIbd~S-VmAps>
zcYhq}_8x(=Wk7??@>xoN(_bbL$+IB<r}V>=0k{JB9nERJx}5WNdEStKp4Vf_;DiA3
zFh`x67|t^;*hF6+4Cwm~q5@%r8)TEJUbJ8HKCnbOujFf>WH+QY@avy5`{lEVjb|Lr
zzCy7Z*X_Z88zl7rF*=kWg$y=)69)m>eq5CD8lq~~ZVySHU(2oNe+_flkb;x@y+z={
zr0Q{wF+D$`upTHZnEccp(yd(=o-OVDQ-(c8<4nOXQ`Tlhh#{i%)z-auOPwUBTgV<v
zbR(?`9faEFd(4xITS>q(EJn8XQ*Q84!JkJ~P{CFh;HYPQ7&15#C7OOPyr8VBvsDNx
z9p*~}E-x8%(CY{}y2E`2bb?WA;BGt@5)@w|^x^L;%a5lZwg8L)ZXxL~QdsUqj!LZZ
z*yfLO6I=~7zx=Xi@Gr@a%ap9X#z9H~Ov|;hP%Ig&5Udcu=?EE(w40=M<Y}Ws#joJ9
z&`$KEppZciwQ}Xa&XMu15|w~_oO%)&j1Vj{%+^pTsr{WVBLXX~fm?(8X%(`0(_wjb
znDZiPIvT2AMqkV_?%;I_*bS1m5g61X8Q1%w9zzBjiQiQMSTbthWqu$qBL7n-3nE;_
zFaxXu<00N^e6RRFCE86*7|mzG5uWIslB(yE%)uE`84v;3iJ<Z^L74H76`07>#>B1P
zUEzPEUcPK0J$sjgW1v?DMkNYna4at-kSY53Yn-%lh!fHL5*+$&lYHPyG6u4TEPKrc
z9A9OVA<yRXd>a;To7bwvEa>u`6Vfp1r6y1U?%&S9L$_Rf*ycj4F*fdIOrWEb^>p%H
zY<ZT9tzkN$0=Wu@c&Y$%K|EwjbF@H%&1z{F7wxnDNp%aLp#^~e$0W_p>iWF*9asxl
zH#8uSj&Hsh!x+-y_}VXe;SzT(NAwq;$B481Smb*AC78Yq*oB2uJ^E7wCtA8+w^7>l
zSjBa~f0_*Cj1YXj-p<XLS^fFmdu_!RJ%!*yXaa-8;Lo;klvv)|{h7xnj5}sS-f@>$
zY*yf|Ve)8ZbazGYU{1#RBV{X&q<aj{pO3X;)yMCYgU-tT@x?8ngcWa(l1Kx!+%frP
z5wB1F*d9L<Arl5t2HiLOy|vA!>!=f~lwbC0H^%bhX6LS*i$4IgLdv~ZOp#7mQNFcb
zS0HkAtrs~N8Gs-HnFNGFADG`BAJ?N>Pk#$qysr3;(0Mb;kh&RPcdrKjItBdA3(7pa
zYI;;e+nDDubhIY(z@Q75gp`YINB}thVaf8U<gTQqRsnJ_f=O1Nud-_aG<H_i^06BT
zKrR<3&cAL>9?Wpqn@H?oin6!W+E`<v{_uoTZH0ho5h*X9-$z+8h|cYPoqu4Zm-S3V
zOCkZ+R-_TAJ($(5AieW*hkNDUf<vgotuMEoFp3$_IB~rHMSoVKfYh5`O`#;9$#(Fi
zhpJk#sC^buMVyh&EgYrs)lSWodzf2Ye@wcP2ln+2{2M+RvqWO8BZecZe-RMFLJaP)
zBH*%cDmJ+%g8qxqE3G<N<6K{TWVwj4|Mq%x9*5q;@wQ|al7V=QShKz6*cyw<FwG7w
zLp$8>^4ZaI?OcU^eeRjAqnmpmH$#{&f7YKQIZBxa88plCV#TRZX8Jx0&c6ukyja^x
z!r=cMKXJSK>A&CX8)W!~3Eu|a&=paOls@7w?eZt$Rdk!e?K5f0*WXan<cf4zi0{6|
zI9}c3BI?;K#o$kL0!6?~nf+z-xdbflF1~Oq@qGW{%m>kv`sp`j)z0~F3W(2aH%yWu
zBja+H3brmc)1Hgo7YV!t7$%tP$zw!894s{BTv#fJ#3AAv!pV|^V?Ns+yO#7z1sKxX
z@Na#0t-t3U>7(P9V(Ai!Okh^}p0W>iLCDF^Dp=fMwLkZ%6?)m%J{X>ZQ}3G||H)!~
zwbQz#$_Z+4x(kJ`bCQ-HAippdxNK4h=19HAUc=BwUul?Bkpsx5CgS$G^QhZeJ<MjJ
zRUk8<l{Ul>X#PfEseM?1WRm~fZRR!s(d3-U9ztOFNpu6hx|ix{;t9+;v(EO1EcFj&
z@j1FI@`Kbl@jn!PFzV|<`F~_Y+|XLq0Q>*CW>5gb9kS9Fib8--7UA=A+Fi?K9;{^$
z6PUt<Tb`0&I))s+cJ3LxuYjU-JzQ`yG*Ck?Oi8pz*ja>W*Y>xrri&*G$34GqrDP)d
zSL~PJJJZAX!VB>C`F!HawLc`pnR5G0%-$6gfw8EG#|DI^hN8WE*T!SK7#T<z`PX)r
zhx+VS+-O_{^yZ0yd=!40IDq_dagT>`OT^my?`nHtFbRUafem-tkhaLx5x{&2A97P?
z9-(rnOzz==fq^K>6<j%I*~$ev7tFaP^A2JJ9)bo=WM~6-Sr(-}NfhP<m%QJWfn(me
zZ78=(u(5Ul+X1{O9S_l>A2iW7Nz#qL4dx~$CaEK;`;obvj1iy_ZVwPiEb}P`w|%E#
z5||b(rkKMG*$v%;fiVO_y<d<Lw2Sd4_)pO}T`0$)$lMw3;^vcg8VGgRP;5)gxd}-~
zyf$pa(Dq6U_(V}ll$MZu@&K1w4FwH73nl(7S7?8ha3#sz|G1Di2RIeFzf4#t=GpR4
zcprh0TcL0bTMV@ex#?0Ac8ub69uyI9yqd(+=D3&O@8A9Yjo3k>+rmZ~dkT{o6Ge;&
zE6r{p(iq>-&}8F6#m6E6OBRPWiO1&h6Pc8p%@|x1JGao#y`(Pgy9o#mYZB|l6#Q%-
zC3%p)1yhr*WF9K~5egpk;DIv0Zj)p()8H!$>(9i4Modi9AJQRb5ERv2-b4BGK<Z(G
zHp+ka(vBZ~O_E;8(aT+Y4-X}3eDwiQgC8QX6!NJv*)z=}n=`RE3nK$)mW0$p^>IaE
zG+&#GMDZLz2@w=`xbE4ZI&7^zOAa}ZM{tcV4le99dL%S(jigBYXKXbSLlMq1#uN%h
zG4nF@z4=*Z4tsSTMoJ8ZMaA>tmr#<`@_l6w0f7xndar*p5aIxT8$l;R!@~Q#@<S5q
zhk7YtG9khm(cuhQ0MnCi+oPi_tCk;wA(kuf*02-@B2hoq2v&9!{B1@76R0OzEh{7$
zLkI#|4;~FS?O|T`30m?r8MylQjb+`|S)knXd}<FXJuOg3n)FZSE8mj)fuK5=G)Q;3
zn52HlWPI1T-iN}F?djW<Eb#N2Hy<d0O)&n~RhnB<jFrX-H#-_c?l6Z#QRfdIU(8I|
z9-Zf`rdaR^{rihYp?nZA<A#u|9r)Dx!}2vC3x?^gtcKVDjbZX`tLfZ;;NDT{Z)<_R
znY$Ht;K+)(>q~@o`+XCvYKny$byn}1GNdQfE)UapM7Yr<MxZ#QLUf`S?2<k;eBgvK
zS!9Nr@QQKt$t363)_DEy`KelIXm$H(_Lf$YuI0~*{{zQBIKSla-|}~Nfh9?JH2Eix
zbfoJOuH%21KkACR7W#~|IO-Nl0u_Gdfi;?L+x$Y(2MI?&IQiMoK;Pkio$}xC53lB8
zeOuIDrYuYP*V%bAEr-t_iZG%GsHXAs&n?kE^8T<oXPr^oE&{94RDpy5TBYx%piv!U
z+}a?ur2$wWPc3rGMF9u2X10c9=IKi9x5D1_7SRhzTtniYf(B?!YD$Pn(u(vM+0orx
zqSyxaaE(npCH>1BZDriE*$TFeA$8~y3+g2RoLbso+fGA#AGN+Q4|uso-6eZ}1Wd7N
z*77x${<nS0G#i!6=jUkC)|l%wxmWE`hz)WX`q%*IdcPMa0Oorpa{E(2AHvDNvMG=l
zm;+W>DKywv+98rtrA~(~lUXJI7MOPqeeVI1NOT^!8IXCU-<j1_vM>_nsVAfOu{2vn
z|DxPhlZ(vQE)?Ydvf^)=Z36Rr-m=$)XyPWXG+UO2jB@?PQj7fx&9Lx`0&dnCf4QW8
zhWxkvkHiTMgdGX<4IAv!F5hoy&sZYm+;>AFIsmtn{|^6FqYeGo+2Zh!XS9~3sjkvL
zYqVbG{+bhPPB`cTt?2rl{@LJ`TH4X6msCdG#C-oMFE=gmvmi}^HOyqBvBd*orawLX
zvnoxN+1MY0uo<v*v!T<#Ixl<d2>)%{G?6w1tkw*WqkmPNw>?V{m-oo0{K|wU9bBID
z2=t5W3nEcBY6Ak<04Hs;Ko}yQ=CO@7i(O{HZ0jKaoZ&HxmP_Mz1V9Z@SMZ0$WqkjP
z2;eDbQ`T5`v(%jfz(`UUq2yyL1v)J!H!N}w<id!I`wUS43p^%P#szkGW;7OXTPZIV
z0d}R51IXbSwZ4Imflz+#w!qOFgeaT=xFZ07a3Pb6A3+;9PhW3XQ^1L@?gIbT!h$x;
z6!-yn_-dJb7H08G``h!Ou)s3_%k&AFzV<9gxn%(i?F&|?a0XzBZ6{69UdqT+6!@a#
z>B|UYojN5J_dS%mCrueT7L+yZ0L8S-)-~qmz6eB&jE!0IO8<iVDDoY@NH))rGs}~D
zCI4JpwZP10NZ19U$wLY2X6UZYxg6yb>F<cf1;>YI1eybFdTY*sJU9HzOQ!<3i=m)6
zltb>G0d<EDt!n_1JSq6!skq`D8mwvK{~8srg>Qk(Q5aDBM<jNIzdTd}HZP@@oR~G?
zPw2qT(b{%hFQvoJ3QwIFw@iv4#k@A3cMfM1Ae#f$Xpq}taf}cRV$9oFH2u@zWcV)g
zY(*viv3C<xB7w-X2|$won(`yRP@<s1-+(#${TPEbdp2V6G)w>^U!Fs`9kvNv1gzFi
z(V`XNZv-NYrd4W#DoRYv`E3WF_DjVfu(EQO;~?V%sVW5XH-=(BM)csUsc6=QNe6VQ
z?Ri6dlYM8Q2!JzSi_#QR7(Y5=?xV(k1#PR-VNT$a7{e)cNp{8YDJa7RKm%r5CMlmA
z{g(GE1i&W4%2XQ%+OogTa{VU#R=$emE|gpnk#o-7jgXlFnUeWU_A%94h!eTMJpUOe
zQ!{d3Da2w~1Y*hTmNd1&4A>G+9vQ1gl6MaTmxWA(jAM+ZeDK*m>HgV@BHG|7OMw`k
zT9P>l#EC9s^Fp3ho#!cs!sH>5cbWnY#2Lz{2u$H739>2i-b{Woc0Ogo|C|P7iOI4O
zKQmJIfe>6s_-9tFS>u0`hjz{V_WC=e1{sU~ny3C5PK!fp5QZqwOzdGpX2CX{0c$fT
zo*7F)n&z`IR4DK78vke6c4NBUtgnT_wwAWsP8}!6rWyWsxP=h;%Vbz%#30b<ujIKa
zrq2v`fw+=8BPn4l>fDX2!Lbz$QT@h%(BBDg4?C{b5M$5~xwTz3y)(X<P0oXDfK<ie
z>8SQIuQw7`AYpG|83HN&u_&!j6rg6ZLq5_08V#h=rj2}ck<$_krJz~p!=O=~;hC*i
zK%Yn)VFDETlV&0yZE<o;M?hoTve?CPISDb`0U8O0Z8l0KV;h!AKxLZ1Gq|zPZ)J8B
zpg_bR=KC9ZHEK)tQw24UKafpLWY|+9L9fI0vva}odIMQVLBrZL{)bEq%?L@Fe}QO!
zt4FoSvH&QWoV;WQFHF~(CeEvQU}bBK8%CToFfw5Ze08azP$Bylvac*rFt(+0#MG#L
zG$UkN=I}CVQUMr9?EF|D`owx+z+p9x{{ED7EE`C3i2w>86fM14Ibw09BlTuYfSdLV
zd<eElW|lD5AN$~~t=a0(=7UZFNrGWV=rLcV5Fb(%k>wK0bGDJ-rz4$%R{IacAF>s-
z$Ujeht?$~jtpx-%?O5R_!JsW?-eRBN%(tWweaN=8$%Q_c&FLl(V-JjlG&Z)WfHJMn
zr@k_VfUyuswk?Sw*%>n$w%k;DNkHBtx&Tddr5tAu;d|(pA|tujyi`C5cT;oC_S@=y
zXQIg*3*wm4VBtr|I-IdS_sMCJ{z?E^{`<{+%f*Av#S%B<k+eyVz*?Ph|63wofyA=s
zapW^YcfpX9`t1l!X-oE`G4RbrDBox(G7X<7|BWA}{Ugzvg{Z;M7oa$n_s7CQHC^Th
z8HB7bQECG-0M>aGiVrcKkxbYY9|38?Eo{pB8@}p-H;}Z{*)1SSc*Z1_?63KrISH|J
zoH^m8$k}2VV*NrfBMQV|68$4{NFu*(5z3C5KcU5>1c+j8o;_?KM<EWBf-+VI%&L<O
zzjG|3EXN|_Dqu}B0HMszjRY#T@>(SZ^kF&<|A`MC<2;05s|P}ATM^jXGbQ0eDp7kZ
z&TK({FqQo)<+sbuwM1-yMP23+MzWEK=}#+kg8QHOK1?BBT1a88av<80yi$czAg(eu
znq7!gtJ)K(pK8%A8e?59qg6v72qkRIWQ!wz4|9Qmc4$@LN*QyTqB4nX@;?gxjw<$C
zT}HL~frRj(zmUjw%YG^Jp8<V;i0w7vt+CK+6b)pf0?|RC&xMR%T6#RO?*XZ^rO`;u
zp(<Zl(^9qkk$JQFv3^XCeh4@V*#$#^m9p$pnuo~$qc#X4y^)`kDHTA?(AdtW!hDDm
z8v4#e6^`1mMfEh59T`&?=D?H!T&q{wJ2e{CG;Az7JF+TACSCvb+LU~asEl><Blum4
zk?#ST2d`EIvb*6pejuh}{9~Z3f)jJ#Y>`$sfFC{-g0TuUq5Lt*wQgW<uHmXu*bzLX
z8A$`J<$nue+4NUKmh55Pd_AiK#=Z`>s$2~DrY80CU`&d^%^Oiusd@1-R1L`jhaygD
z*t9Gh+V72gTMc1n;mc@T)1R8&SB8fU=H_)<fN#9D9oDjOrSTv}$fPfolvK5wCbTI-
zP)jGJVwrSpfhs0#ILIYbRZVEgU|UO}&s(C|zFPw^t;q!Pjg_o{X3_E&BYtln${?1V
zoFHGGS@U<Y5RoowavBI?j24TCGkII!0|T`sKy>A}0kB{5Hs2iA)XzcNdzAi~Pg=nD
zuI(9BS2ZrE_6qd<=Hq7Uq%FiSMnsn#FVB_jp-`)0V7fv3!X`k_I0s*Vb>+7I7XFm<
z4|y`$8m3ad(`r!aBn9%n)y4z@^3118BluK$i_5PG)uzyoH&7XV>;;ObYGYF0_k^zl
z$DN75cw;3V(-RtUM&bt5OhWY%%`Rj<R5K}s20nx;63lziv`;Pu!1$Vtzu%Z=t-EFe
z41BrZ+Dr3kn5)T(KsGh>C8DSaZz_-ue2bsSDnRWR>PDETk2kOfE&{M+SSE9z*9fKF
zl_ll07wg8?5HOdPs?yLKvI;RS>i%c42K8o3Raf|1S1Tf_ALKrfWH?KQD&uS$`&NxM
z&63$ZDQt_JE&CZQxNel0M2n@}zYcAbKq1V-q@0NYJCyAmnMz81t`-742+bN6nYGR>
zA{+TRE2Y-{n#!BqD=Rz<MW-72bLsywT2jBPTloa3<UDL4xz7coE;agd`93YZm<MMq
zKp?SKK+pYxX<s|C4`1)ATcwHZO39@1Q_Xp#vU3JdJL6ND!Iaa=uO#OI!ogZl8gya{
zk-`*w=TO|T5g{D;YSi>WD%W?GUttnviOk&hIt$UWL*ajs%l_p86$66W>P*BgRr>oB
zp_EApGm*8tsEN$Tq{c8ik{c;icp#cE;KNR8(?Wi`@a+r6ewd5!l`k&@xq-lbB*dFi
zvcMtKBT_?xuke3cHM9{D_~t3k$K)pm`amW%aV=n&UMf|Sf?VAaz9+7pvqwh+Q0YXB
zIC0HVIH`oAHHvNBr5~0HbWL66zCJX=V08v_;yiT~vyC$pf60i_thndLjHnQXll<};
z(+c9uEjcx5O2tqXz5+1}HTe7$aOrXsshs&zU0oxj5UV}pJiT(9&QRu?HGPSzX`e!5
z(6KO0IoxhQ4<2K1%SCnD;!6dZa?n{{Y+dZz%|iU6T)w3=U?cgy%+F%fym2FE$Sq52
z3aF^zTe6jI{X@CmXJ{|K;B20_o&)uO5QxI<nGJL^r~3ZYn!(uhtQO|yJl@>{(vfK6
zTs)3w%o)(XRt7lZ`v;<ya(~T{j|E$Dg`YmbZ!%lU0%=Jg*B~O5%^QElJPtKL8Ry}R
zAe>PAp_b(FBbX6K%9Sd|@YLsY5K9?Y4quCjw*D(;05l>scTr=C+?VK);(a8SRb`}K
z-Tct!Xe6pBHT`)-2f*+fijoQ>kDxsrBOo(}Og)nx|Eg&(vnm-yzI1?0nbvZdIgq<9
zAT5HJ$8rwz%z|C;LT7c>LQ%H{)kyZR?*yPv1)BSOH}s_5ovS4YSL#6zxy4O+xoxR^
z6o%#^RyPI=CZ*9di=qHHcQ!EPUk1LGu-%u6dilqVaws23<-)Ill#gngX2ho)h<Xn6
z9lZllV9l}Es}_!Dv^iJC|4SLp=H+&T{~1-9?f-?NKjVD2o}V<Wc`%@CjHrhD#7jBf
z>_U?ER#MW)S2Kl|c87gN>#2>Rwp%$@8wg*`0F?Cic;Qt~I&sGx!bq&w6_;G${|va|
z2pB7chGKNp^|&GgM&=ES#LC$_6V4wu>ye4hEb-sxfXM-GJW%A;^Hc)4UJXAsu*Pht
zE!*&u3VwRPXeju}{5Pu_G%8Yg-vIPidlbIF_dwO?<+At-;MC|heb8emGm#fsOl7r!
zJ|{X|03$PH>6o4|H3$*jR4&*W7#L5=o1)6*Zo_XbDQwL=rRo3;<jy;bV`~v~Bvv7i
zATCX*Nk<Y_Sx%zOjW8+7IM49h$y-k5_4%sM+v>PdqvoPt)@G5|c#m^7Gg<tp=_Zrj
z47dP}od67ZTh18|0!6Y1cK&HZc)6t?li>gr@Fnd_By}YF7z_<Sm-hUS3a#&VV4DFm
zWn$%)GvL$#XvA6XXTXp*4%qu6&JeS&OM$cMYBd1l=P_`}!+lzl)%Y=>wmjxQ(31bB
zoGsG!^Vty7W5i!^pyRm{00-bpMi8MdhGs59fB(;%02CZdaw<`**>H6NO|j;5ng)uW
z6W+M!UTzMlD&vUO_k^>QMya@S<_6><G($;pf}^cZs_RKP+vS)x(7r;8=-^+6)ZV8y
z3#rc=2Ts`>8wl|^gF6F~?N`bF3x3&1yiv=Aw#KZ-fC4-){GW2T;k3_x>mk;R+CT={
zmi#~Ac%BX)wn2Xl*;K>Nsj+axv3TcPr`l>r;oiK3GlF$s{5zuuZK*n|VI+dWjKX>}
zj(_*}>9LB3#Z;mHyj$;ZM1|<&Kk|{UQj`<9r-GLfKl{Yv4EH}{f+QF#i>tn=_<tlS
zFp&JMCPh;!>{AoL5%uRe<u?<jr$u&#0-h6KPx*f(`c!4M05Z36S~VQ;a+hP<fSwFj
zx)Qj?5>aS$;RXusxvPzG&bB=OE=2*G*J)k82^FBdzi011l9y}334Rorb3FVpulR&n
z;4=wiweutpl^AjZPRhI4P!yaDcE~u!@?=&Ou=Yknt7ibNcwONcU7?ZGG~rvKKJT_W
zvi0|6KxBU`+B47CG8d=7p79?v0Ow2u)Ec=`@htM|=r@PJLTL4)G>2KpMC`sQfFs^o
zTB|8$bkTCzzXRic?vMAia5#`Y>XEp3rob;O0=)F;*Q5YMli0pq@b@eH9do#vO-;%D
zj+9gc^Mb<^&mE60IN`h}$z0orCXY#K_;aZE?Qyu`DGhZj2|GKE<+fwQ_aCYJ9`m};
zODPErBsUMLp&=*n?Yk>(3z-ae>7Sdo(n}(NzkAuC?thPX82YI?^UMoj>WB`<Nsa$|
zyz*KBr=~lg6kL)2C4SDyj|--=29u&;vwlb1?8*Qf7(e`|C6zA<Xc*2p32=h^URY&D
ze8527Kj8D_K-Gei1s(oV-}(i9|7J#Fca?uv)L-Y41?LiI0Blu5Qd=z`<&BX$GUHD(
zwpucC+WHgl*{0#00^6K0eMPLMgkB)+Q8}-o*w%5Z?ngV~#AO2d$j?H#U;bQP&Pka#
zqX6}#ODpmn0t+8bIFa?-J%1q+YAuM!`EosFka4&6fiv@RjQ&IV1p~1`fpiyvqgc0<
z|0B+(K41<s5Z~1Tk^!$a-{&~)Q&oVr{*fRym!Q;b$~hN*V)W~D*nuz~zAKtidmnO~
zbxFTM>e6o)E&PZ!=l!xfz@ClsJ|lh(H8byL|4jg{WE5;g|B*}>SBZ!GcS;sI`VSTV
zC*=R7Br>!QAs6lC_HR$|;~;S=L5QI+<|%`k18`#acR$(&4ye=3SvLh55Dqqu*yH~Z
z@R_?h<PvQ2)gN%)+ZM2`^zX}rbFCf);`*f2Ui+kfWc)t?4rEJGKd)CvSY!^!agF}%
z^1kmXUN-|CNK8DE`;pIU`saZ1H}ds1q_5BDKda$qkL$O3g4!VK!nxu&{0hG#;%Ap;
zMSbW*(jsQPJ^r6l|C~s?YlDzt@dvK>rft2LwMYGP;NOxGSQn5c@Z4^1*KQUy9@;Qz
z%OR%X*7bQa>N{Xd7qA7q;YrFSXSU{I!NAyIAJ_!8^j5web=nrNOt_|uC_{n75I6)r
z0Plf415nNp;aj{&s3pFCAsFm)p80!PyHWtQfe*j~&Ik;|6*dDh;^nwb`<@x|Y*m}U
zJW%jZPAofNy@40N9??25<#YmU0iQWgbIQ-ffXd*EZ+Z`WHvV4%yTC_agOeuA?wRQ}
z_hAqCSfjrut6BrPOjqhyh7pr@yTAwFgJ`P~Kc9h3&N!T=&_G&i0vga=U=!F;1F!|W
z<v6DWLLipFQSrA0>?-|3UirKYtjawmPT~@&9bQq@Yyt;*A&1i6l|xOr$WuzBZ4y71
z#(#(39bi#*z7R`dbrYwmKkG330WTNW<SgWtAIDk>VnF$G^bZX`$G|7xqufOvN*2fR
zHv?MR59+`<2WXRHJ}ua=(m^4$_dUw5N5A9$4zR)>=IpFj;qB4;`V1T?{YSj7`xZ~d
zY8gyD=yP6fTsd%Oz$Wk)VYeUxrwtt@|DRQRn*nn8-yr5R{~Y+%^8@$4j{gsw(V0l{
zDi?e1yq_&TU+K?n;5~4c_Xr1ax-AsbdB)dF#*fqAU8&&(i12@~H^^nLKLNi2KLa0B
z05Xo0n&(`QhFGl;-9Lx^SHMreTa&?@<=ZxI&T7d)o(9ucPEY;?`~>_4d{!n0K$q8+
zCE|ex;)FRz{{!#~-|x}i<bZ@O4X?F}S|Z{I_yGI{{0zKN`a@uf6J=UrG_;A>MXK+B
z{{X)NpS4#z=X8xJo(gS9&MV_8PJs`=E8tb_R>_Q4zAVa213ik7GIIU|{0Y1U-WUd4
zdsSCnZe(kFWeOYuZ-8HbKgX|?bqC_-coPErZmWzqbM*fLyi)<l<PJWYaRI4=6*M)U
zf!Dy#z@Nr{$Hpe{*P-v<p!?sT^*jRJ0zU)40Xs^6E)%BOVtO>A;9|dzz-!>war`HQ
zz`R^0Ig)gqGoahR@4!#MpC-Ug*1MG6mh7A+vK)Ti13v@50$Vlww8(!OD&%C?dXt^@
z`vClh@>@A@sSLT9m648t^ahS7zrO*$*6<U`NWFP!b+T_a)%^ze75Ke&t#-hd&#;j%
zmwwQY*7q3r2>c5CTDw^|<wUPp&f^QE<J%OdQ-FUE!WI3NYPPW`sy%bsutVT4;8);}
z8vYATmzZPW)vDG;#Rxb8-jM(A#`RB!r>eU$#op*2&`!Jt{sa7F2H5!T^j}N7p9YV=
z^cg7p1^fd10qoW2Z*wBiH0wSokRrnWgA_X3{|E3B@EX{!36SXkQzPc}4jDY}0)GKN
z1FwN?3qe3Zv!N<Ynk>*8GVS{lwfDJdAIIpnT&0o8KMm;qZj*bjYgZbkv^BQ-mO9Jm
zd$~a657NJ*0+?~aN{8xZ#El8$BA!j)SKuezJ$rvBGa{;E?vfb30Jdm-{;XYzX*zI<
zBxon*h_#;`;4SbP_#N2M8>KQoVG{cuY%bRS6}8`GjsJm;I1{gXB$4|k;CJ8`;CEoJ
z#=nB)lBNu0sim;?fIoqsfLFk0#gFSd?#PtA#1|mveS9SU-_-Dv%8}rvq}x+*|4)G}
z;CJ9x;LjR<a+=X;nG0ndb|WT#4uH47FLYp1eLm*q+fXL$JTd&d;{KmpN5760u{!oh
z@c%!+@4!I~KLw`^Pl*k*=ZB&_odMmh;U^Xw7E9X8+N^efcffDJE8t!2PGuJ|wkUj-
z{Z6QZ-TFV$`u{Oe|1@N#h^0WR4)50gC;9(L=})O@x-w@gmMv8L{5np5%6fg*-#qJU
z%!vPAfd3)=b@`2DY;;2+Ff05!1#S-H1pJ!=VP%+{b*BU0d%PT{zbQ9yW@Z5XZ`?rB
zeog^g!T${9#IT9}6<cIo^5Gru3mJG;BfxZ^vtR>uMVas8(ElHzPcyV-bxxJekUzWx
z&X_lT1N;R1I`LBWfdt%IH0U#Ly-n-&YN7&6{nYz_$#knQd?I>(1GZ}Jb3u13_Ho-|
zyW_c7uU8W<1s_Pc08+v2OX}|(@EUjxysrV+PL<^>Us}_q6^@<n`t!K<0|f=*co!1+
zyDGDbJzfF70h@XuvC*%^vO>IvzG%;%sKDzABxAZ<!=GGyM;E*Ohu%k>|JK7ANW+ME
zFK1l;FQk70|3euRX3zJye|sdsg+Fx%CKYvSs>hypnY-VKpHCC%*M5cCnK73+_!IaU
z_!;<EyAT-sd*WZ7m(%)~_50;I`i=jFoQxf^PI%ricKQq0ohZKrjaDcgwC$gZJ)HOX
zQQc?7kI8Q+nJdfOy&?T?#vQN`eV8sIpn?s9XUwrW{yXo<qu=<iv7^<!2IQZ!zP|vQ
z<NUYw4J(@DWTBH^NB_^)IWQ@=JrV<&G4S6beq8LfcOCt)9GDydLqz!BnfH1__wOxm
zG}bF#0!{0Emu#1FeN!KS-+@=aU%<gw$9<V<&{msEHoG1Yy}ts#)q>X=53EzTFRE{k
zNwU|#Z{y*E@oF%3mdHWhC7s^_zXPv<w_1R(&qyqCCTB~}m|_15_>Ji8O{CvE5^I>a
z;O%X@MDHtF&-(idR)BJ8Byla;M_m7(z#qUn;AouxL$)SoezWm8^X88J-+`^`_&@Rw
z&y1wxx(&SL^Y16ppGyTlm6u<zZRsPm=bymgMEn$DfHOI-u}|<h?KwevKy;9K-2?X5
zI|#i4ex>!C%zx9t1y%PIn;v(755Ql*Yv8Ym^joUPDlR=X2<<RQ?eO#A2K-pUERX|x
zC%|XQ&ns%b3H*1v8VWo5f}g+S`6CVARDR)`&P!ITK63GY0-q=1XXNwW&Ig^-do%fe
zU%OD)_-_?pNEg&4@%vT`(EfGw+du?sw{#`n&%XQ)#_6}hed=dD&RMs4L;g+BKSqBh
zB0Qk?bVTp%6~oVi>+l1~xuz`7IsQBS^%t<Eu}_VDlU(Z07{7`IdJlXA4#&CYKIgzD
zvlR!-hdG-#0Cs_Qz&l|7Is#mZtpLt=Uq^#CB)TfgTVNBo@=sV+0({75b)Q#8OaWjJ
zu+Q@2=|uDx{ed*b%>V`44y=56G;#eS-~jlGx%vfG(+a+P3-|zhti?V`!U^z>o%MUX
z@}i&`aUQ^X;ADLF0R+GWT|;L#)_6rmpB~6NYX5ZndB^`k0;gpbV=lP(_q<Cxn5aNb
zNx$>@7I_F_!0N+a<o{&)uYfI}AphrO%IX!c54@%JpG^Ng@Fy9*!%H*mukV3Bfz4X%
zp!6I6U9oOPvbPtc|NRa0Ujm<i80hiR&MxuiZ1^Y2&t&=!n6Y*H?-EG^MxZ+^bY6=e
zm!v4zZMDc;LP`hB(SJOVehB?8;kw8xS1kYahUNPS@;d_dfY<CZx&<r)2|dt#;%^)1
zU56i6B6D8rDvMdxJogd!1Ne9Y{vCcS;kU+nU;EVIH|d{@C6<KQ#%Ut&f;Q*_unAn=
zO#gtpy~J$Bh;HC};-{EMe-GH<VUrVJhb6U)5%{0L8({wi{QN0bJ}&`H#?FVV-QUFj
z6Y~EQST8Tv4)A|v<Fs|}O-7F`V@hs_Yq2If;PZPE&8K}DzzS~*Ycq{&hZpwsBI}8t
z5SZuv2y+y`9%bm@2J{>Kv%H~lkr+7VL4$qZXd-&-(*#y|KkYQng`Kfcd;hxKSNa!$
z6}e-nVE=@5?|qs7OQgRm<v-WzwLj7Q()Ta$*55@QL%rbUeVP9eFDx_ZOWEkZ_m%Xw
zcw^s;TyfE-RPEf1pSD~XJVyg?&d#;nufflP+?d#;KDm(Y;r!<3XL&){9PfCtIOxmx
zX#h*S@M((5;G73fjwi+*<L_Um2z6*uPJkUY&)oF>0GQ(iS#!jn^^dt$<&pR6@Ca(~
zcC1xkk;d?pz5iwJ{mt|*^1`-h7F1m*=|YJw<7b)|4leSJK<o2$D&!{oG=Vi*?<pW;
z7U+bn(qF~@1zxe_+K~o~WA-Nc?lt<mz#<*kS*8=uiNBpM>dz%!TIuAk$5S##c6R1w
z`WJX%*_<-~&t+OwAIPfK!a}H)p9FGchUJ5YJZ3R0f2Br+AgRG>N>lQQcGf)y0t7`3
zcS=I!e?=hSIQ#3bd4EupX~dRXITBC-SHpM4aj!pV$`uz)o?STij=PW3FJBSM^{*{j
z&r4>+vI#Hu=@0n4^}}V<!TpN=*V7-fns3#-9tU6azrg>o=Od;Tj0Iojzp>D&*@~aR
z*YLl?cA^IPZ$~}9jGvetNo|H7mX7OH`+p;Tnj8|}79G;_@G~xjL52UOxW6f1ZUe)<
zfS-`}8yWsHrek|R`bGSNB)%&Kut)riZdkwjVZuAFrWj3Jk^a6%|M(UVKP~@~4Lkqz
z4gBx&mYal$tQ+w!UuweFruWq2+<?Kg{FiUEO3<|A)30py&DY4UJ>OwLw#R7)U&epJ
zj<6}tP`LOqetiAes+s}=bS1rb9d^gLSngyfDcSU!o{#;>J$gFnYu|4@VCmm@zSe`*
z|JL>E0mz|v-QV_pJ3=%38~oQZfL3AnJM@p_fYdiXZ*|D8+J8)ffaRsW;r-Ugl`)_C
zclO`T52fFVU&v{I>~HL!Zl0ExzK#Bf@BilgSN#2*=WYK}zr6K@kG@|^`QOa{KrTv2
zW%Adz(Qo`8js3zGo`*E@<X<m8`l<Et5C0zjv%g1wShm9jH~d0FMxSQ*x{peCLicVY
zVa+#eur<4VgM!ibH)Jg9O=*!E7^G($ze>RM(-8W<S-@feX!`dy`0p5)ar*K147Bw+
z6@u<>razMP{~ZNy0{XT4r}W!7k-uYLVo`AE-&1hbE`q;-AJr~@M?u(86|ms>%??T^
zKE~fPFoE<6eH;G+fnfGG^auRszj^=FK^py^aS)5Y_dNVg_iG*e!25p*Q9C^GlW~^j
zurg9NhNzKTE8dU+OTAJJeMN}cln^&!h<pB(DUO6U!gZOW?n!>(%ORdM%uX}K(w8V}
z^z|v4uHCBalkf%l8<JwOQyy0wnD7lLCYPcJm>;^V;+!v~Xj**ULfk%69M_}><cA4U
zL>&simBi;?PH{x?{w4sQd0$SEDgF`)&;{e`FQ*6+9-5hAg|*M&LSKuYHt}aEj*G8P
zkwqeZ>Vw^wB9Je&{775m_wuV!l#-&EksQ4ZSL%Na{Q-}9*s+U%y-K|bKR3R=hM$2n
z4Q4kzPy93_MH+F!$wfJc=Ii9QCAatvh527j5jXg-HQZm3dHuhUBF+W>!>>${HJIgW
zN|x(tBL3?UKoeNvdA}J}XwO;aIR>u2SY()Ek!pbz4p(71V3Fc$i#&6bfs{q!-7gk7
zn=CRdvqCgvk=6#%eS!XY*0~mW?D~>LUiTI04bQtG?J}^!E|-vX)+5en>)r5tlNHi=
z=IE{2^O*B|zEGrGV1;RcTj(lmJ1kOvf&U#~8CV16d1NqSqsv~sNI3S1fhAxASY?eM
zWrh2I6BWPA{}mQlU6KES75;;Ii^!NHIQ(q@8$y4NMal!6urcwINRfLUXv>6*&%o~2
z6xrr|{DWsqcR8Zv3;Q?6_0Lj(!Pc8E7YSRyDzE`8ap1#{;M>;$bvNT@9asaFfG(T9
zc3EUS{>mniIi{H%irq2QtuM+?Lit_c<jjl?n1k?GgSrm^$Nxo&P)E+_?vwvFWiwon
zc9|kH%{uoHE6k@~Y!X?LK9N{v3m<-Elgk3I0xSUwK$qQj$DAm9^hNzM%_f~imYJ;2
zz_q-3`ZLzQbbxi>0dR{O5VAaE-OAg*>2-mKEAih29s&2+rj)YB)D5$7dvGle=Yc!G
zL*OpETv}{%b8TkL+LR6bw1ExaK5&l;pv6-r?x4*_V5@dSWE_2W_{Z&GAk*C50m*fZ
zPL3rrz-{0GpYO6p(F68?_dpZ)bOZgjfCs=mcBw?HkUJKBWSik+{wKgPa2L1>tP?}A
z3=?_B=Zov;p8{?H_kah$l4PMS*=hOV1{>A@R)L4WW8e-j0|e|bbUlunoKiBGe%IpQ
z-hUfdVw>+d+YC2>PhOkf*q1B<4}hnn--`76B;h?fT_=28f6_hRK5&N$5F7^e7WfPJ
z1RUMK|3%;)@QC<r(uZ<A%3a{Y#3ps){{nCycnI8Lmx(g~Jz$&q=vw+azy|OLcnsVo
z@k5?cb0cOP{gXctumn5+9s~C|_bF$O=m%iB&i^s`XSsh%)c&(T3pfWpGA7%-&WCFQ
ztH3?r9&m><CsGdS*aTVt>NY_BQ}MsSp;>{9`09}UYkkOBV1x3z0?Yt0@w3G?%l+%{
z@AT(=;6AWU12B+-s_%ghH^{H!zuW%}>c5<-`+$Rfc5dMRJa8Mh3*4suiDkgv2jZvF
zUzGt!fK}i*@B+9CEYpMN)BXG;U1Jj%;2Nav0MCKvz(Z*hJflNB$Hu+M&30439pEMK
z9JmK8$f?Uyb`HmSU6OucU>SG<yaXNr8=M<teabWJdK^qVDCB%E(;l}70Bcj-CP&iR
zoC5=58(0IL0xu{7Ey*G71G8+~|9k`e-vbZWq-|Xy$NV}wLw7Cx-vN*4fjSIcuxqf%
zeC)~i^92p)Q{WkJADEXprF~!<m;n+X19rygcMYNsf$s^SuFO|nmtzluiTDYCS>QhK
zga$<W><@ueV4lZDCgW!YxC?v_{6PFQ7=rHuyTBq5`FUK1VqgJyNbugG0*Tl*K2HTS
zxQ_p`zysp<HZaFBg6kZglfIJ)eslqJiT@YCEnrTj7WO#DFu0EXg!uUZcm~{Ni`kIV
z=$5~R{`<fWz)RphutLC{0!x11&jcUB@c#^W2CQ)Ij_Y;l0gHV9Xrc{t`p1FK@qb=g
zWKQI-gNa8yobUe#_zrl?!GV^-I$}U)^iRN#Q{c}jzl+jkbRhm{4(v|k|0?hdcmX`{
z+kCHhRHbvX{LTXRfbabM&)BAZ1iYt4KA6b=E^r%o4m<(w0xMJ@0|v1(l)vhzL2S}D
zN9p|zxWzWLkP7IG+H1h`7<C3Z_th!jr@(EhixjwG+x{Gn4)w<OFd}+>1fFs1sLc&o
zlD1_#K3tdnd%#QJ2ZkRVy6-d0=YenjJkG!=;4bij+G|k;Z{^fJ4auj~<A{*%(+l7w
zS!ie0Q!0=l^Qp%*5|sYiG*@=&^qBel0Y`q-1*i?&23`RF1Rii?nYFo2$)OcHM_uRt
zD)1Qip8Rie2$c)M7DR#c$92Foa1VIS=hv8jbLM)1aB%q9oQR(d;4$z5cmk|5h37oD
zWzN9^?}n=5Cjo8&&w(F-?>ROyVhUlF&ljv`>`&zXb8g-uJMEhE%sccxE`j$G`G1S+
ze?oN^Q)$gGomsGDu&x6f{vQG_fM>ueXNM)+LdX1kBK>AP9ey6ON#82i4$c-(ei%34
zZEBAnffvArnEM$S<5VyiU#CB$`|3n!iPo<{z1J0grdN}nfZF>9;5oxDO#z+p(3;c!
z@5kxy2>u_6_Xz5r4ySltPQ*_PtkHlt71(7RHl;YV=u*`2@91~l*Z06}Dle^JEK1Aj
z`|ITQ2_2{vX%ij*Yrq*W!;JRTMEp4U{SJ7-2&hR1w8w0~CF>E#w(JVMPbqzdUKf83
z8GL56W|y@<G6I%?N2K0uU|WWsOmUW7%#`+rnh~9&0{kAhNAnl(@N}2^VI*9sAK9Yz
zct-bkniaSvYa$6xy&m(%rRw977f-0YmZ<mI((Klhp7|rYveG^-{CUa<Vv!F7lpD7h
zmpt*)Q~H~fen-D04?-$!s|NOX+2DRn`c{A^z>h?)Q9j}fk(k2Z7XP`XgI)6BI~vd_
z0=r8;%4+J?_FiGI1>6Fjk$-Ogz==&M3;G$8k>}%y2>1S{z!RpxI>cYfL6U7C<?xYS
zO@<ap|4W81u7X`q6HJkWJ_~}C4p8m=2$+{TY@21>1~a0^jHo>Ywgfz+`@Sxh^~Hoh
zn+oWBq78S+Be(wBtN@x0Ysv)tK5N>S<Me+AJmt{nT*gjJ@ixgG_kXV@zbn8a(*Ka|
zPQV#7nOrG<#e=(b{NJJn|A4`IPT~{#lSAp*uM!a*#y&sNdzlsQYbXu#SCn6m{*d~|
zg<l&~J`F;+MP;6G`p=+7{~{H*W8tC<O)sc?8r;GQcE(inJN&zc7ihO5dVeGOgXg>*
z$iq*<<mn6GIq-nV=ZqjJcp+$y>)))=zeN4<oau*-sE{Gh6z|iHtsn-?mrGsV16Juf
zf$q0cJ+r`?gd^ulK$q^nN^c@<go&hcj(Ek|u|4nd9}ndDmK+hv*-{4P19xhf=s8Ny
zJ*FDWdd0LP_V6x;)mH*I1>6Rn0*{%z2>c5hoau2HMwg7+*F=hozwa}7G*5p$BmXR!
ze$HyrvC=;a+y)+U-hYR_zhS7$VJ26sMxSaXvkR;<`+JMVK9B~s2C%@ZeGYiTQ>6gi
z0quvtQ{W-6Ow(Z<9TB&~!mk5u2ypy=#`n)tV}(>hj+S+f;MfDs)QM{W%Pc+IqWkCk
z<$^7l2?M%)Uf6P~^gH+GDR768W+9V+JH(=mFWcu0VF2;(=-;5yZOHu87O=>MhR+<9
z@8RFMkB@*`>>V(81^%H&tESiKe+WDP)|kAEiM)a+UZnQf)B6UD|4)Dil;6O=_Q+h>
z6F%%!;-*+Vctrek{FcTpsVL;ezRHGM^}|8f@NWZnn(PR?;F0+w)8XbmJMgXW){rJ&
z7_z*u&Y26l<MiJG9sv&+J3>U*;_!gc^ger9JpMnS_Fd(b8wI5>=6?j7MYE>{U>3M5
z{BIF_kzD)S;y9aaV5f#3qyG_o_qH_PN3?_iUB3gKT`oTsK0jvoYNs?K@dOuv+w{R)
zRTmL}Ch!3GC-5V1U!q+QLxmjl3Ta1<Oo0W!8t@(PFW@-=+~Ru!*@;+k(4GO91|9?d
z1^meJRV?PWkPCDIis69`?TJbMKY_apR`lV3Y_P7l6TLGt0M>!;fPVl_7<|S4&{b>3
z%UKdQPywC+9s&QL`_Yxyr;tm-tS$eNx76is5AFfqGxnKf_hQEJdB(+nLBL6k|Es`D
z;9m@1n$mvV;~>|DpV2$fAZQMF0{jE`4!A=Hz@QV#7KUQR%Arw-@ZVXFHh<5a$vJ$|
zI--u%fXxg4);K4`UT)4vNQdx**T$Bg1>gnnzX0De`BcdGIy-R)sc9eB(4GeA{|E4Z
z(ih1=BhZykSdZG%8|NH;UI70Fo&zg#Nlhr4DDp>N9V-2Ez*EYPWsmKwBIE{A7fqh(
z*ntLcANU9GBV(^b=JlHnk9`Apr1Wn9FM)pmPk}WSYXiTnK42Corw3appc&vHs|UB)
z{07mU4GCh0Y%3|NxlP~^_0Ju8KM4&$z`e}q#$C~XRP^5>{f{K}F2;stht!0p<NSXP
z{1@;N*kJl0_SH@yvv@D)!{$!^+#~-Vu<n)%Y|N7kq$$gj-&;%`Jpxv!y-nuL0gCw*
z$NY-&v%=)pQw~{a(EytiTLc+U{~Q8E=(MkEp4p)7ZuqBY0ygK*uuZ~Q`O17QvUx(a
zzZ!s;{mHJ)<{tHIWkJ`0JG8xtcrc-eQG>sEk41vgdN}v@4ioKa@&k%qmmd5>{$}}U
zO6%dQ%<H$4o{(>u<}ezoX_WxFL-%!7NH77k!v)J+|0=VZm;%<gq#1c_DrYYmKL2j<
zZlRL?CEzX#N0#|0WQ1zLIKtW_tR`4`U>oHB3<G9$!crih6M9Pm#ftwArE?|`b!*U#
z`2O3#8uR(3a++uB<r3%p$G+&grN+P<aGURMEB$wYd+c1X)1jGo4<SidWt&umpEYjY
z0z3FSq$1+WBcKDUu*YOp@v{WnW8tkUAyXt#xWy57f&0pTNb|-zi}iL)y`b;y_TM#`
z-eQzp@&7Kl8jIg!J`mVnZ%w)t|68R0j#$4y!v9Ey7pySQU9tB&{NED#6Jn+i1sL;E
znOl_quJYeDPmMmr4|nybCEzwI6czm|?EIUN`vn8KQH31NSRqoDRsYNZ8|<9vNYrmZ
zV9X@O0^9u74gU@m##IGxz_lbShB*1Xt@7Jt^V1y`+AMaBs7Pzi-<45*hy*{YOdiJa
zegmK5^j&J|DYw0)PF?5j$75F0404B~uvSIYl-sh-wwb2ya^(_>#vEmKi%F^ynKSHt
zos!dUBN<(6-h`Q<RTE&RK&|Q#%j)eD@H(1h0%)Eo&GOjBtPD3Ymn|HVp%z)NXaI@{
z>swqxL*n;@-(oVnz-dgC0Iag_OQp{gxY>|I3{XYCYusO_P6=f2e=4^%J5}p8`?d;j
zgKA}&{13$UiNt-62!SPO(!>;}>#6q7<f1Rzf`nVJ%$|ah{zX1NHR-k(JItChS#+C@
za!LOj2b@jG`_+L6=o31yt*H*2mH}Z_xV7YuMLjuf=n{M6O8VOzuxIa&#Bb40L}A%b
z1>T?mH0vAtSM3%QU^hT#mZv-_{H(Cg%MNY?V}s{iF2VUAOEvnPNUqX)<_!6BF`y0c
zVK!LgE9qa5vlneSKn=NkC}1whZT5=Euj!vz`BlY#%22q;QxdCmv`YoLOp%C1TEPy4
z5m{*Tuc`h?fEj603?=G`L>xnY#k@@OEAc<iv`bUm)sWy(ICX^BhT`uwNh)||()@MB
zPn+^LZ~S*swIqE?iP&9pU}HKED{@c=aRkip*hH6^gG?NEivR+qP^Q_`(Q;|5E>9jz
zOPm*o7@I@l_HdS6p<P!BY4Y&GEP0a2)$@@bh}u`oxeW|>@?@TUE}`#!+ruHR#am=k
zk88^D<}w9iOoynUkbAhmFtHS%dFu3OCXh2>K|}13?cuC?aLYV+(eOhKNFvDScPB;M
z`v;~#m*sgini#{tBFABNT`yIOVzt0VL*+Jn8yL|BmtEEY3gdrQ_TLmhCd@M@aGCU9
zngLs7>A;v5iRv0L2x*7`?wA3$CZC4ycv%RpX4I|M32Yc3pXT><8GEY4#j*<lb?6)&
ztxJ3VyeI%uU^(?}AyPEMaC6G>zYWZBylz8$Un?V6I?maHHs3c@!@m_eaxOdLc9>7u
z@wL?zi05VYO$K6p3qg^YGzY;-0kwH}Y=ui0@UcQfI1mSJi3T7DQOr;u82$FzOzNO!
zm=}4>vgOKH9ggSh`tE;DAEXd}u0dr}(ck2Gg|=+9dM|-|X~g}WGyHWZ&n>YYq1fP9
zx-l%>FmLiZMW14p<pM>Us-r^Os9DoL3C}%uB;JmR0*EKnAR)`#tRQyZHqWLn$n96)
zbC5fr8gjdB)1{F+M1M;_YhD)!CKIU+@Ah`Js-htRnTX?B_^!MKf(<$A9w$hrC=H<m
z8=;hs)RZRv?W|{z*^;qut5o%8;sm=`YT7oSDGI=5KWN4<Vn)Ywkj*NHVr0&sE=cL1
zkj|Z)MRfBf=EYs9WO|;dcToEkeiUzIkm;HS*(CpGXn=D8pnaDKW+Fa7Bf+4{$;j2A
z5@B8@D{8Rec!l&{ngU#u@uWjae=dzgR)*+EQ&0dDO=<L1>uKR@;rnzA>d+35G%%Q!
zt5P+53gjywIcVocPH`H0HJdRR)BgHJrc7EKX+EX&&x!Rgo)zpLOXMcFmKcDzfpnTT
zWwvCi)N8d*NMaYLK(0)F=h&*H@lqkX2;U8vQ3E_r2cpAbGbHd-m!u){kf(tp!LVzK
z@pWw>S}3CR2QWv6we1W*n|SDqUG-}~R+qLd5r1U>=A?0BC_bdB5X*+x`GzP&)}cO0
zgkiNL3b-9Ddhn(EhQj~0P!3XI1;|i*^ESIuOP03%#T)Yi$%0x{vr82+12p2;pKJ@c
zH^y96_;uKrjg_%ssw<OeP0Q6(Zy}p$IyV7^4IUb7i>&3oD+>9pi1U70K;jI*Gz)Z~
z8ZxHkel91FYvB_2du=N8hRBT)oXLGiAs64~?-xzB3mSl3*cyvUu#{*<9qL$~B*@w_
z+!#{6ibRcTBBCq9d@KI9!~n*8GGXhR?Oez)A=4x|ZOV=DH4n`8(ze6VZzLzq0CXk!
zU^$maKnv;Un9>MJ_oFESQ1G>VX-6`qG^iCrAZxN|w<fEQ_nTt?dPZ#r;RFL~_YIhQ
zv`td{nLdul@`fqow#2Rw)u}oRLRv?L-K4sI)^9Ns8IDB+1G(OP3P@t4P1dc<>o;2y
zO5=p#qRmM^O~b$8DfU-u$hUyj)Og$Wt3`Uc6bUt&+Ur_K1n+Ea9Ans$p1;gDmnQIm
zyx$al3;e#eT(}&I<^Uyc$n}R}z}+vWiO7Q`uLfL2?C+)RNJ({!tVQ=b@%5CctxVkL
z7Txtyeq(~kAZUxvglC1A)PVZ04A)H=#<FHJ;3Im{CXulWtBT`TF6xfu=@FM<OE>8=
z9>kDhiLovay3L?#GT1obUO8-VbXrryCGZVEC>Ih2EXcHhIHvZs%u`A#0^fR>0Z632
zS<*i(I#|8N!sllwEnQZtEn=zH<zfIJ8I?@DVWmHb(YIp?fxOntjYg5~1x9}YDH9q0
z6L~V>EG7F2$ny;ofTs9<x>dP=#<V90zy=VRDvjtnnYl9Sr?F@z3a6oTTCpva@r9O*
zjzz0zgGB(b27zpqZCmc|U%;<(PsP@o6Ns)bi&^;JV$ig<{xT)jmJCfH9y}x_fLsWu
z_)io03;6uVx1oikGh!L_7uPD~wd7wUxe&;{n9VB{P}@(QS>T>Z4n2hDqF1OhFk7#y
zHh-xmE(AQzX2+34H$%yr*R;QWQN(F&4G74aRD+s?$VCShT)cVAl>ul|2e-J});X7t
z?Iol~EKV?J(}B_aUBL}c{YJh>z*oty)(S()GDIEZ!W3hf<5U*Y8khml8sJD05{3@Z
zpMiLXk*~&bQAuXj1Njre2VG79&<boQEU~PprdcX;W-f2!yP^@l)V{G{YA5n9s)rO1
zOr_-fx$b0Qhah4`fm41Cyk-L+bTx%vdFX3-i0aDx1iCWC!5}h43IA<|*ih_ltvD4y
zq+j!FfxM(?ui~$1Y9pw9h34;S;jB6X6;Yw^KcxYHm;s|j!=I*|5()f8hHR8feGk~)
zp>cf)JZp3t58PCMATKZj8U@nolk<xDjJ+5IL7n<BspVO#aMU6I)fj;<Gnojp;m6x2
zMTq6>;xQf6q|XrQQdRpG@)PsttSo38^;#6Q*Ao8XYe`XiFak!Q=#Pkorooszknd8h
zUV_RV4#sq4&4DcB>&!4zd5MHN+29eA1sUO#2uCenQe`xnbeB@U0lt<3vP4kg^iPY*
z$l4wu3fLs7)|?R4@NX-R=<X_hv$3Y0sD}Rpo+8%G%Y0|X+~639;=<n?V=e@An!KpF
z^3{vXS%au+(>Q7nN1QNT=8Hn%TOf$feOsf&G3yczA&Lrwdn)mpg>acLrcQsz;zlfs
zEPNXP8h+Dl;kclr637)YgB-FwDgzZ2spwkx2`|(Brduky#$7)9ZOeEU!eESxq#{G3
zRfE21@{#|1RVY*In~E~mbWp~1)vO&LTV++RTm-FV0BW1!DO&rh`r+nvo8E|P{I}G5
zCKXJJ2}azVSbjg$|1ukz%fpe-rK!!j2&o#2W!VjpY6D0^-Wu38q`r6tK1eL1q!tDy
ze3tnda)-l71SUSxs*Tr#1B7_jBNEkqTb7dnMIg)zg<C2u>WM%kE!QtLpU8Bzd-F)P
zHuE(_eG6+96VrlfUIS6XArM$fIu=W&TWw0wiB>8-J{2R?*U^fhB@~n?VQqudECd36
zrTN-VoXc>voFW$Z8Hteeq$2dCNyx;^WRiTzBz2|nYtGa7JsU5ct11y;sI@`WEJP^)
z8Sk4>jzPXrvucG;kw<?asxuQUmHF^Cx57+Hr6NW|d@Nh9b;Oz_$blmFNeD$(Sw26O
z7ect0!B>T$k3n-*Yg{w|OpOI2*;`BdsEV(t1XGMzM!iw%{L(!$?FX^Y-aJ3^$YS3K
z)yP?cpiC7&CVrJts4`HCGAv@s)vRW+h5FTlNo^6iGH>LQ9s0V$jES{I<*JM{L!%D5
z<yA9rvK;Z1e`oX7004jhNkl<ZK2z^Yy~~<omigAl!+*wUH3N2G#lAOcE_NtVY=5fr
zDw$vYw5U@h*EY|gCe!GwDO^yQtr`7HL{7-?6`d_15u((dSNzHZPa3$YoD|ozI27=c
z<Z5Db#$L7&5e-1!uCJ{mS-M8xD<V?_!Px#4H5NpowgW+w8dHNt24r!@)BQb~|3bWx
zLIm5EVQxja0Bbfk_pk7mp{XJFf`MMan{qIRDs8R8j~n4i0i|-;LFzwTGklu-NbPcj
zjMGi+{TjB`^0~ILhWXs+HyKJ~uQX?3-)Kl2It4oLZ=_TmV%ZJICg-$yBU3=8O+#6^
z<-H@HmO`G-!1Z4dlbR>W1YU)&jC$_(jsHWD^1xrS2E{qAP<1DiE2?5&#cSZF&RHPL
zce3SyNfUNvLo}}#60bwU&ww&%nnC@j5r3Wtek|fEIdRU}P;HSr-3W*d%Qz5qVEpgX
zN3TT<mY1s4^e%b7PnHXXBmWjX71)~Ut{{`jWh*0b;A%dxS>l|`tK+A~ADQ)x1;wFg
zCCju8fZT+q@Xt?0KF`z=c#t<@hH`V9eMP`hzR{nO`4#_1{JZ7J%wro#3alo7?)fwo
z@F?0NQX9*OM5`}fnaG&@4=KNeP@&n3jP$29`K^%<`l`|zW6ji#gauhVlKE~U`SnON
zc|y6d1Wy$&76jmEEWfN)H}jpiq1-`UQ8Q$-Ec_aPIZly?fqNt!akx{D3Ft)p6-e-0
zh$S0V7uZt(uIK@29AQciq7{am##bIiA8|gG`%%mH_u-pgGv6f!MpU2%Aj6RPUXjoO
zxuB~8(3ParYJU`r=0Wx4k!n7dO^r}nVaY$p3x;+<*pRnf4a8S~sJ@!-k;*)>fiXip
zi&~H#`o`Qg|H|$u?92D5>k$c%10RmI<>^Q+yqcX<O0Dg`L7)`4oYys+F?kgG=>P*S
zMD<<(r|$jWfY(d00T5?-#7J3H(g|?lP%{$Oqes}=!`5}s;3`aER`}@=xh8Ff|IoK&
zg&6Ql!_NS?<b1}V-*^%F<bcV6k?CyHKLe^OwX(HOCLEo0SX1xAhc_nhrCUI{MY_AY
z6;wJTrMqLB3W!Q0Au&)=NhuLzfJiq;!{}}%Ikxxw-fREu+O_RG&pF#U&-vV+JFI4>
zdHCR#fcq@{E8MGnRSAFdTK}nqsH<328YN{{Ru4`<IHo%c;;7^_UcyL^s$W<-7>sZV
zPtgp5^Qh<)H@kiDBCr|#NbxKkFMaLRa`WNLLsoy)Eq+ml7mJI&*3pJuam*)C?!-Y4
zNopg$&{b3wx+$?%kn#x8oyhs=GkEB!esnd8ECAE{Kt|wBD|uYUjui>2#H+6Dgy=$o
z#>>mu2lj*C-kh=V<71!hs;v9fgp&u<Rn<QnY2sd&{Y>JJ5g_l8{Cq;uLBxbz%$`kl
zQ}vH2INuap>NH*E`dXZvR;lsDbI3^i&y^QLeaHWqjmmQq`3z2G9EV~W?Cz)A2_4h6
zv98SSG>f!KmPL{FNZk-N$d!x))Tbu7{IKr(@ai3SM~4h%wvPOrywIkV32#^|9(Y&?
zvOS5@g+U%A!wpzBle=y1VMFPdHG>?Q-~W@#-AZH8|9Yy8rl_L%(QBzrulgxL?U$p}
zJn1vxIum{Wozu8(_!&~<)bU~UOGVzO=$6&*qL@~p{Pf4h>kMSo!>|nUyi@3~N}p`O
zPiahEHUzvjch^TudB5i6tL{|4PD|aaB%p2cH|$ENn>CxA;chz)VbI<t!q4pFEV!OW
zjO~rxMZS{B%mbNw#lk_-^ur3D|CMjpUqe?Z{*xN|`p>u`DrxaJ9^dU7E66SAbCvj0
z0~YxRFHZLo)qD`TMCDOh-{=^{&)j%<Qg?Ql>io%?zUXsL%M>a?;!MatVq;hN&oopC
z6aC!s#_kHqvi((fDcC;D(%A8jfbzY-!nLCD<7wnD6Q7x7?VqTNU-*R7@-g)IZk`B`
zlFCKxLANe(bTs)?<y9#Ej5j%7NhHbjgtA}vKAA`&)@;DReyC)thY)R|VHXhUHEN#*
z<s&T)6f=L+&)>OTDn_1xjUIr-Q5V%v1mtjsco9_gGO3vGq!_>a(4aAKQcrR46J8u^
zc)#r|vY>L2*5E*NLvo0Qu7hS3u|;IM`s(kvNqF;m%bIU4ME@f+r&}SKz|JkNCt|DV
zA`OaUKlgA^HzJ^r>pyHLRZR>_w~=_gmsUbs#I=9}bF`RLIiO7UmHJ#5mI_)6#J6F~
z(QD>(anDUT2T<U$oaa|_gds4ERm6SO0KL}yQ=dMi+_(ikZWFERmp*n}g&73pHc++b
zkcZ0Ia`|8-DyB!0YEU~+YlqtieM7{ne3=Q}rKh~7&u-4&oJpjl#XTPoVrMf&9sk^a
zq0tvM=j13)4f6cO%{9*gGqvteJ~kNE+TjduY=LeEw7BtiaP1Yd+^pWX4mxJv2H%Uq
zYdVM-4zz2ElxwopZ<$n%?{IS<k(IuQ%uC#({ZT43L|~zI^a}J<nmyfMVF+Y_sgYj<
z?|fLV98H88!$vnd^oQab71tVOj1yzrC_{;0HXrVEKd5#3y~v_pxBX<Ly#5AdaB~|Q
zHknBj`av=_`UTz(9Z1-Sv)3c|Yof`N?w|f+Icsk3<-^&vkz4n+J|7BTHahs@qwGzO
z)KMP^C1kShEx6?pgsUZEY)0(iwe`!|B`7AHN@e(|2~E8Rj4<y%n^O_|qy(JPqnlI_
z{FeWku0`pkt`$Bi3q6~U?H0?z%OqBbr`;X5;J1fA_M_bg(_2qKyY}r&vfq+uxdwvO
zDP%#);DsXwWuS>6H}nsAw`Gl95lK?be@(==5l}gvlx==LJ>zyBCKAA@QUd<eNOLg}
z$1NT}ek1lw3^r(~)hwu%{&nR~M3miNbR6OzO{M9O<>*%sIAl#%-`kRl)_woQ3`LNw
zNzpP3cstZn%5tYxr^{0GH%gDr?>GMpGOvL}uh1aPBJnvg9lA?mK+^w0j!0z*q~S&$
zaRq3e3Yy)gfBEc2B1(@@pUtY79LDgyQTI;>t_qva$lmq~t-O_2HAu~*+C2{#H^B^_
z1p<TzFKA2F^*ft}-JiAW=n~K}UI<L$r%A-w0Omws6;;Qd2_Bv&e=SR<sQr0@koK?9
z%Gj0tNzb5n&l4-9x3TZ28t-_=t<IoXS%=fF9==^)EUUnFqXc)4Pun+=CD-SbJW-DQ
zxhyMYhas1NpZ-J-_#LTu^}pv@CdJH=p3r&U6yeUS#2hpauM?)fJjVadWxVrvT9Fnp
zu@gG)Q4ZW6BbRlFcF}cm6)M74y0T4FjojEPKoBHFF{Qk}MC+ZR$}m3H#>GF=7*7KS
zZhukGs`R#rhuS}Vn<+K~m%~>E`@%<ERheU?;iDZn-K|ZrRuOVV|EdR%OCLTqu%~So
z{CAyOfc01F<6D8OJMyR{T52DAn;k<$Z<@8LJjo=|wnyQ%4mzh2g-@X=9P5p=Ru82Q
zU};vX+j+_zsbh%i0X`?}4a}iMa5spzSNnlcveCwP-=z?c*O#bu4>+10Uj?6MazhQt
zl6PW|;23ZWzVcqx8-56@!emYG2UyOSnFGlc{uSGbNpAe%PBWFKx0rlf5l7K2-wE4F
z@_eENJ|VZEzbal&<_h|NaE0!K<V20`@c;+kIrH9={nf#oXcV3u7zf6RQ%G*Ejx<3Z
zO+IK{q?hBlBmgv0QZWK<I=pL|YuMMLm*G`AqX1_S?~iQ`=Upb()NCi)heQI=cpQD0
zch-ZJtl2l&{H~vcZau>XM}gF^NQbAx!cHi2(=OihEvo^Zyi@6_(&?p|$dsO3k-Ppx
zd90r@*FEWeoXVl{yAK#dYPgkl-`mWoUpN+E^95=*7vyGD6O(rKQW;J-{O+|ufUPG`
zeiWmfCn_<h%?YdmnD(lj2%UH)Vqp;0`_7-uILM!>9Rgn*@!-xl##)?4o2DA#p<Wif
zGzbMR@5V}Mu*OT!rRa`$s~!fDN8!a(t)kGQ4MWA_w6$~v`PeIpliao5jnGdT!v~M;
zIDS2atl>fd7Quk!M+Q37g*`y|{;lkPNP-vwtoYr!eyFs=7q0xs*lb}wykj^Hq%w1o
zK8Ou<;jQ~TyNjgw3{|?x72goN@D4NP4_d_mkVy}&ytKnr_V0kHYRgMpGB9)iwz3uT
zjoh<8Cu9(7EgOaBg;xW&hanH7N&4*mNIP^KI~ZB+EP%0iF?cZ$Wp}plfyrU{VKtKL
z5q%`Y4qOAHXjG6gYtnDP6pgV55uyXI#puJopzJ^juj<4^g1YCE(JLks$O^jL3_TSE
zGq%T~j-|?%@Mhq%MLHJdKW37@7D53RS|F|)W90vs7s#y?mWiP3u4s%`NjUrrxDdO-
z532z&KOrcU+v?P&A?v=B!6Lsc^?%a8ZIy!bq`hZ86Rw`=fWl?~;(fkwdKQkM`bC)K
zai?*0yjrGq{Dog{!A5^dIOI0*O6l9FO3L}JYsbH+K70g@@#;$)^b1l0pZVST%hd&{
z6RI$LfH<l=RVfAfWdt}57UDCI_5k}KBZZMQ=GLrs`9PSKDqnWFc2fV1cqr#(?(Rfd
zhrjOru46B<8?N^eO2+6%_4YR3G?)dPE3=|Pi;;?E@_6J$4+Nda!pvLKt^bW?u!?Wu
ztPaizu7Yi1bn<E?{t&sj{;u@#>%PMgUDb#1eP7oE8oSbzDT52`AOJEobHqKes}U^s
z<k!hBlR+pDk;k{o!G6eqy>T0Yv^SW5XYrNspm<2XOEX(l<@@1VXjPr5WozIF5Ll6_
zm5SkSX(YD1TbP>VIWIMaV~E?>WK?6Bi)*J4F4g3)OCViOcS0ch_^8uY^4TH6G?*EJ
z0DzYnfJ!M{CX}iEO+9W0iTR_v8w$DriE_qCnai&Ad^%oh0FwMM!*@g1Q9ByF&L<hu
z`}8Y+gVC?W3%|tX{R?cYr+DfB-3`|4@%wF(F7<)cz;z_@Dj$PTMdo~aObW^lRVhN>
zkz$CwzTVcL<6}l1mF&s6eA=B!xMHk}aDnELvwrVApqwp1<H!%Dc0@3DzJ4T!Z~rhh
zj*d>T((F)h2Quj2lXCcgQ?m%OS%(Uf({kT>F3<2;;buCPHL-;4$DTraBR4H4*KSq_
z`50}}RzvQ<MK{D@yXw`FERG~#<Uym^6IS6sryI9zd5laqxbIpk%P<!hUqURXNeW|1
zm-ZGya{9|Qb&KqTiL6iWa5eAWa<+na;r^PmFU{5Kvu4{#1pe)%M+=Rom6GGtLD#6s
z`pu*zmymgqCcb?F<oLvF5AjuhI_qFA<(Cnk^1dAeYCOfd5+Jm4RB!qBW45$bWK6lc
z%I{j28livX;p;L_SA5}jpYUXEB=n*w=3X?8_8rrm(4Vw~<bN0Q<KCRi_`02~heX0=
zfLV(k#>}%?u|@DQo=e31iA-HyI^B+R+QkYeLv;_Y1x|W3_kogCpxo-OH@5I1QYJNK
z9eGWzbn#8$MkNIY{jiN-KPp$dOj|NmSj3_Fa75^ZF|pfpz39J(xNiybHQLB$mD&ek
zL3C-&GUv$SP-V;CW8lg2W1m`V;m?6T(MOX(hXz;YOeYtvH^ch|b46jAH!RrYsWY}F
z+=Uh-!Z)k(SON26W1-i7AL3zisK8RYbeb|f({Kr-(BhznEn=7>5F1?O;j8p(->#W;
zgn<u`pxc(JwIo4(#4eh%(sz%Rme*d&?6Bz`_odkTcmIry#92EW6JQwb=)FWx;AQ_)
zWMUZ`tqQ?1*mi~{0_FP<>C1PflC|eDkgjfiAR8=r#d1Y<MKddQ<HX|-56Me{Y2}25
zgT3lQzAL3h^vYy$_z`8X*wxHr%x?mygyWhy8fHRW%W@XcqD;cZ2EQaNc<s*@L-AjU
zfbU0h0c-5`aPWNZ__3*%LqkKCKQH~8Rh^Y|TYP!!E(EbF`*S+Z-JRu*w2g3CBK&EY
z3M9n6b#x;{D*h0%!g9h_EA=GeBaU%!z@RPAot+^j6mpk@<DemUKNP?90h<4yvQ<PI
zSCGMUaec<|8({*#HRojLR@!LQ$#7D`I{(|Z^2Mxe``gr44{w_Zhrm>3+X_!1GN&q(
zlZ~X3I|_&2$C@n-?D+45|FG1Hc#*%|%wA!=#90Ashmp^lb8F^1avV9ddRh(+6vK(-
z?mZ8y<Zw9KR623}eC&fpfQ~IHl-Bai!S%=18L0IAonwe~<um4!>Cf4}2#pDUxtx`c
z;Am$v<FgV}JQe0kdD=5^y~<enFu4=_+Xk76JKqVFQuXKACe0%ymx4iVjeL+)6i4kS
z$-KO3-^Q27YgeK05}=2NO&3R1JE*Wf0$4Y}IP(r2pQ}fs+%1kD-md`UE~D~COmZ-p
zIXWph%pPvQYq`HX9z69rAW}EU*FbIU9p28*QSF&%txnDT6fvBn2!p)q{wmeN-TTwa
zOSr8E3c@<j&p)bgAolYjJuSxkkv-lg!xfhLXD8^Nm#xYB5)W!?+&`r>DFwM;`_Y#V
z7kfkO<1(NjbFQM-!&2De(*Y{)r6reh0A&f>TjAgs`>Y%&ABFg}myS4o^27FL>wcUT
z_2}(<YYYo6*`g*$EHpl!ruU%n;!_Rp1TKLIn{#RWD2Ci;3n%>k<1HF~^ZmMa9wVI^
zvpr7%i;fV<xf;KnSBhY{gUhy0N`m;D+dZY)h*Y>%vdsyCLAv|fN5eoAs<=2LwOR0A
zd2-v)cmg7jpgg{9U8Gc#^?T%*)lEKXr$c6RVM#~PC8#5>+X&5;vfBFeh1}haMo7)_
z8w3sB_1pbqBDhl1jd6zw2ZrtYGe3fz3AplxTG?doX`uS6{6G1h!E#$d=y;j^QL|5g
z9cf}J<tW&b_1g}GhmH@#?n?5{f9!6YTZ@sweq{KneI%_v8(_lmQ3Jxabx!SYjEqF;
zG}V{`=kL@M``dtU94!i!VTf)CB7){%8JlP}0)oDWPTp?j(%6YjmS`-kcfV`EVn2^0
zS+F*Vck#FxMG4(a9_aSxB++1vqU_pR_-DAf)85nkXM@9P&U2{C$w%<^MzyQrXcxvG
zWDwH#*+ui<n3PF$xC>a`H4;qfVVW<K!$vL^_ji%0Sa&;%X-cfUaR)B(Du?R2?#r=#
z46sIJWCePQ3S9BPyK~y7D9$g_=-tb!hqPzL?Em*(Gba4$zX)urco!|gR}`-~9(f;E
ze^C0cI0@H?@g2C~BB<N`JNHp!7a@#m02&6u4NnKr0=NZI-4Quq=l`h#D^8&AvXnQv
zNAF$#+YxQ0)Z-ucDpz?%5bJj5(Ifh>S?n~oFXzP!s_DY+Iw3lo`{~BtyfV<kS4?>h
zm&w!W)M7jrp*tqdcVF~_cW$F6pM60+$f>MNUBa|YAo_IW)}dnX6(Ys*uL()G%!NBV
zTt>_{ZGx^>7=i1W_s#JVC4S;K-cz<TRv}(N<sX^2gcAgdH>wLN5KDotVn2w}eDI+7
zV+{2?@0q=2_OshiF7O4#J80Mye!K3goK8*RPl`rof6?)2>eESGb@8wIG0D9V6<BiI
z;lr+(GZgj)N~{8BR{3v3b9!ciVwZpjD~+S6;HP=zbjDP;kD&Wmpo-g`uA|nyLmLo3
zDv#kw!>HREC$4$w{>Q+lnKyb)cYG-$K+EcWcmAz_qIeBq@C_|9iDFkJKy|D1onOu0
zo#Xn+nYLKZMfLdYHIV#oYkHDLp|-f4L4uKOXkQ4;j>EL?XB^YD3hzPc2R)3!l`rwt
zccdCrgA{WD(BB!ZwyQYvW8^9Bcm+kOc~f;p*y(#EbHa8-u;qJ(lK-UxO~zWtH)$`Q
z+n-`RrH`$5lHvEj?QU<^zmv`-G7BOK(r8FhPP5Hto>}~xbd3_{;aJAkkX*!Uy_)%H
z(w~q>#n$d1vtWd7)!<`UCdm>VhIYQ-w!e~OL>LG5t?~5g*+qD@7huUF3#?<_t)>0W
zewVBFGW8z)8b!9tkDo27FOGftdGc~L_}Xj3E~AF7LiA}>lQ9O!tC*8j{sLL1fb)^w
z*7F{OFEMcMm*cwgy$GlI>0}Xkle<IKfh(q~IVc<DH4d`q{bPh)*fNsPqOUXWKf$@&
zaLIYUFv82^>2Ht2vB=d<ILzGwdj%?mHh#EQep_=@WSwvAM&UC^L<`&g0asnA;#_}2
zXN_n-aTi4{xD4AYxU>&J6>1&a?t94ts0^BLXf9~8o4MF&Z_V|_^Xk|XYDcAF{^Qgv
z9VI-b`GIZLqjmI!;P11ne_W~Kq_yOg^#kPh`6(=%TfQ#KT+p#N&{=>pVU~q|UL6;a
zz8$EJT&{{K7&V~*LGH_4eq($xGsXtg*or5Uj-<u++Ti@>N&<$@y#dAp|9SX^%z97>
z3wgG_FCZ5gf{=7@ii$8aaQ<5bzLqM`vO5i<BNsWWGY$%qz>yS^4V@bMm~)TH+Io+&
z^#lmv+yF3y$^taM2vv9=69Boxr<pQ1I0F*x1!lA9oZ<dU=RT+37Q2K6%ARj1-g3s=
ze~XZNbONNvTkT&X7#(tE<`;@e3@Rq=049adqUgV&I<1~hbfG>S$s|WK4w<<sqs-?i
zlN-*gV^Wy+Z*U~&7p4Vfe!&CFZ!Sr2+i;guREiF>N+4yo5k}@FA7gPZvIp+tb@tyo
z{~NWNXISD++>=<`{x!1ua`f?)@I9z~hY5)VC=<r>H3e4Ia3P!QA&wim&E~zA3Fn`H
zj>z9GJ73dbMNU@b9@PXsLPayiJzHc)4t500mC&xE^4xA2v)3{{d3`1$fyn{U{Cfal
zFb>(Te-=ja@_89;Py1&gFHkZ};X7R8-H7zq>HJVnK{Ct*z#g)^pI7S!{Bi2a?I3ix
zCHQ|Qgp$vSqIZhG&rv+KLE~|~Q5ksf3Wp!XENBa$UtADTS!N-;lL!#NO*rIxM@$NS
z2hPW(9xVvg6ag9($kB;)X*Wc{p={R(V-N9p0?V3p3GB(N7n-m2yhh$z1WUi=po4ik
z5MMoR$}#$^a&HU10A&C3<=Z&8>|B39&1n$6z<WbhgaHNz&$h|<#}~XRfNlIAZ_z1C
zReS86>-`A<T%al<?AOl0qlQ1yzv}hl^hTRu*4@a!LPuQTi(Ip>ykqt$(Es5n-Vat_
ziSfp4&_|=X_9DnoSgWR57Qsq}_bdOu{7OVB-#S<(RC+msla2VQjsEQb2=>h}hytdt
z7*%WVGG6L7cO&${4-&<DM-yNpIf}Xb%$FJ7x-tVb3phq%6y0XcYGXxmwASmhNT3^<
zoXo|rlW>LXqyk;kdojAYEet?2WR(yJ)1qL)J8Fk>d8`bKZiV&VD4xvmn{~~Fjxwel
znd@I9m5_7Q1uf7B#HvVROa3Qc)NfKhiYwMD)*pmJ^#jV>mszu6*Y1k>4imR5Cn@t^
z4o7D_>K){Z<|KigCjNgKjav2YZ}<i4nMcMSx=LXRz*Ay?-{x@~^W8Tu#lg9Ed(qSm
z_+j^=C=?69f!-n2qoHuMXz9~Jlm9C5lHpk$wjN0GdR?8VUaH+uz6mH(7^x!O5yaC=
zqF#+ZSF%5sX4iiE_ddF#H3%6p*OUjcrnzRr69A@((T~ZZnEZoF_bKgd%mwd+3gIGQ
z0p!yC_5Io1Q2q3y2g!UZeAHMKT71b3(S3Z^hU!0~oB)SFq)6xkmN4ZG>Xmk#MT;>4
zV2Bc+nPB`aqM)<u)ra@Z3eg+LKXRjj56DdB%2-ajM-r~gc7ulpMsC^wnaJg=>)r(v
z!6-I^_7|nXC1C-FhrFK=brktQt!IchmZ^J`21r$NoXwOe=eTMgpyjbB!h$h)UOF$F
zwYFzF5l3?ZB?%*lB2)xzT{nTfcQ~Wh4o_YC-<(}SP8cj)uyrePIdUx*dybrUgB@He
z_<rVTlpJJZ%KQ(e2@-d)OJA@7K2x+TFGClDGZ}ZQ63$wQ74H>5OklxhDlRvU30PHs
z7F^a_z$3)#xQK5m-!1%Q%uV?#IRF6>oj^_?wb}$fDQ9=<EpH^V8ZCuJF55{I@e1%L
z49?A(`V(Ioam8ueosu2k0rH3cc_)63j=c5*`B6O+zV4QXcthC4-XPX3MaqU;e<0|k
zcMW$#C)Pot0)YwyUg7ml5Sp`Hi790q=qPQ0i0PwXzT@aju<C2#6Tq*cv_ekTL22<t
zfA^FfHhp7ocuO6o@1GC(6oho0m7+i0-y`<=gn{bqj7Sg%yQG9$ne1CtP9g~uAz1L@
z>PoxiE;{eBHIBk#lnIv)wsXdM^?k^MUk)7IR<8EOB1U0u4FxSjEkSP$=X@<k-I!w9
z3GF}z%wdn;jx}1C9V9RKHRFsUm$+g&X%^{&bmI?Xs@IYDz<vnJ>X2~AU)BMCurPd=
zlXhgwhctn+!T1(mS-4J#h<_1ZflU#saES6<Ax+%td12upLaqlhRg=eUHEuy5=DS!@
z!5lv3QNC#i3oak>kHwELdu>o>NWB$dbGnh>1~J0MfLI^TXjgtI01+W~29B(&#(l4l
z26r#yFai!lJEnOfJ*OzVX-%m27y(^n5kRu;e=SuJx`;F;bj%@k)6|mI(oC|Tt6WD}
zj%J}*%V77!w%{4~;{?BooLcNEOxZ)>Wq?XX`8AuUnTyn47{AnLTm#%Xh|T8A(MtI6
zQ8}qZMBvfle;MGxAQi-fr6`HA^L9wOgb0NvS^>0JbnBn1mCav0w*_;Fy3W~4kp?_1
z1O=(LWbV!$rfI}mj!7PPw18y>vLd(JO#JoM;`iA{8<A)EGL?*lgiqOx+|BYz^c7YU
zetq-Dst~fCXIRE>3BQ>Mo7vV&dxw3~pU(Fk%g_3Jq=a2&<Da8>BaKG=|8r*47F#PF
z(zaOPV2hp>F;<+l@?CS(kLDk#73j7|Ofo}^GRIBU%Nw(H<gmv3qwbmb2QkcdCf9U<
zq|>I-gYuA!8>vqH+iEaHKT_)XiMJP=E+|sb=Xt2={RC856x8s>=~EE_-3lY&m&mio
zWHid@9?f?y=M8#Elky(Em2Il%#7s0p6GIHzqF){$ISltV7mwXrzRm}uf4scN!yv!%
z`}N|sCE4F=8f#PSjNPeh{<C-uY@*Y>MQ1&3R~-|M-y$QPnB*KxxaMVe+_F7^WLN-u
z(SECW{@lt9Jp)#m1AG6as%e}r*zBfEimm$;M>I*l-)+)$=leLnv;k`^1@T$Eb#h2t
zc6!Uea!sD4BDd1E`zUqwdv7wVms-&^fXu)9i9;!b;xEYJtOsA@y*RpQWjtONn1n{M
zyrSES?4v+~#Vk$vh?{^H-3fRdEhZ+}Q1{b9!y%R=MjYE|NIFYo%#$6Y)^FR#Fvn9@
z8}VSB8ZU?_nHPuxPPh^C8tgVwt!>U+{|e)uh}*x{K@+jk^n!{+MJOq8w)$%-=xuv%
z2c*J|`;9IF>OBr)JBz5%zwrT6?6k`6qc+(3t^^etBZ}oau5cO48Xc}p7FPO@0_~->
z&cU;KL#ztEkJv3^Kw98?a?Q>U(`t#hzz*Bb(AQ1kMdZPebDzPpPErf&xUaIJ0u2TH
zRZrPvM3OB&mVeT>*;_b;#;!UEvdo7ZeAdB45cuhzKq)Lr>scq__a;c=0)8(RO&rs%
zaIM@Q{UIG2@Y)GcrcKdvdj2%ccdKJFPix=ivjk9bwdRZiZYRMM@sC8qU02Pzr$3SY
z`0asBl^++z1QYCV)T)O<HvG~pzJDk*&^JGeqzF^TjQS%E8r0Q|6dg?9GgMN8cdZna
zAQDm)5=KXcNl?^QN47q$fuY^{z=aT>!%bRU`?pTSYSJWvf}tH%R8{w19ABP(WW=9;
z=C?8$gcoAzh<wv+?JH<MZT}kv#VvxTpcn$oe~Km7?>C~fguOfAc_pFS-^cF(s0SW)
z^|rdM9IlIQd4`U3Rh|I=O%(>rNMBp{mG>^(#xfMJWB?i+C>T{A^2zXLz}VfzV;>g5
zT_^}c0x&6*7VI_$g?W~9sz+@~PjAfO^rXu4-AB$7oz`R2AFi<Pfd9L|6(o%9C1}AJ
zE~iEovN^Z$no9?Yw@4V{yrI~;B$KGHBDybKKhYbpX@9R0B9a|pn;}5Y3H5848>2}Y
zF5iI?1B``IEyvvXD<qWx<=-+=HU++~V8wetLDs|L<R330UJAJC^?Am>gOLHs5&I6V
z8)YNt*~>n)ca)e&v<jdc!GIPsi`FSw?!V*xwIUQ>VZFe@%XZ{{PVG3NOywWUD<Lod
zwJ5<T5lUMKJaTr9HhaY|2?z2d!Dsc{r=MPsiz};?ktqcM1rro7HaJ_(Pv8F8!B-5)
zIa+c>kVzR#jo9^d_OjDM)-d(eVHZDa-2)io3O`_0iCh7UqAjBs)e$3GQ~V5c>qO#e
zug7Jaqf&FI76whLf&#0EU(^!Js=vy<Kbe!&D4>uJIim&i9j|f_r3f(mWv9B&ZWVe|
z1;7;J4e4^W2L6EF1i4pol|MD~1cp{;75cPGS+KWHTGlA~zC~HqSOZxep)}0eaGv0m
zox+w?t)4&8k73aoN!)%%G~gH0L`j|EWXJuDPoh_(4Kz!x5pLUkc^{%Ex^Lbv_bCwq
z!n)}l1VkF9V3<GmCRW+RpE1rhci6VnUH@hV7Fmp3x2A~H>Ef*-Gqu8ck`C2*OkMbT
zN>NZU7Xki{t=vbz$u$}8Xa>GVhx!C^h&4Ga=JE@f@uhw-2je~4^lgdzFl%(0{!Q*8
zPHGMpb+F9K2YKI7<&TlYqB2L4Q&##n<{!*(B8e9>d1unF3dI9si1*8e|JhyE>Oe^>
z2#yA@GTeuMGfr_r4gi$On)@zh<|azV16dkf^?~yAKjgUm8FEm_(q-A^E>3l^kT({v
zXPEZmcuX6+|1FF!r(9D0xMnl(nzVe7g$mxa8Q6pcXsf;Z@1oVP{y7FIwDMl_O5(|H
zp$X_WUZlZ6UJh?F6EllX|Cbb@n{NlH9v{}-EZbzP)rHM>z7`uY;ml`SBn3)m+Okxo
zeO)+l+a>>g3EiJ$XP_~t*Bme<15Ugj{Hlpl@mUhIG&`xUyz;lPb*dUn8q0xDpGS==
z=Du<3*|m4s>hn7M@5SalP0}*kFT2#=3}52C|2$cB&QW=wiysGLu-X)C4KVSM__v6z
z@gVziA%`E3<z5;`JSI<9u9P>JPjW$5Jw7vG$OED;?{F=?{BdkQM>wCtbG>N_dahNr
z#}TN@&SVPD1`F*Tg95$se3D|B<-9G@&EHjhAN~FmM)aalt&uhX+(aqwXy=-QsUbZP
zAD^V!VTi~zjBk8+=vmnhmh==5XBEr3CH4{$BY_BXXp^asbeouyPk$qCVE?Yc^pwAl
zymqLsRj9y<V<*`iwms{jRCZ;#dh52!tFv}1^#^kB*xDS;*TEb)>gKGjxur4YwL)6k
zCuz_Ro%z|yjuCMdeWe%hUl3fXbT{o2*fO(+a2+W(E`>F}ayJ#qK4!aJXPip#df%8W
zbz=^bnn0b1EY^J_M8C-U_G}gTr=ICt^4C^S1A~#|4|)mOrUBUAT)cb#+{-H_u?htv
z-=fb*gVgWJ3Y6EWnlLf4KH^i<V@%C=vp<e(f7^MXLUq!UQkZP)+uozd#t#9xK11);
zpjsyg-6>lAgFMkxX#<z^sU7>38?MrhxDn6}+vE5lMwpdilvJ;v|LQ_}Uvt99$51@V
zIJ5!j9S34&Bi>sti%)9Hl2eV|!U6TN>(b(kpN15`M$R+Tv~b^l)JW&?W?4{Iela>P
z9+ITQk3u#cmhX+c@dn3>3Y}(a4@x^2KTU37et;tk$R8k0whSlMC41|WodY#^5j<Bc
zhkp+&%F{bCG&j*vq$Xy#^G5J{f8_>B@RrnSDyF5|E}^hP&flOxdh=Wx+RYACZS_v`
zG9Htp!#1oxi6_$#O0{!g{eepzQ*k1w)QDB&D(gLTs)ksMNyR)Pe-m9&qBIou>46M8
zcb@jNKL$~xR`7AiO)L}H*PqI`)+^CLP+`GP`k~fl9xZz}so3PatZ{UUZAvB58XITA
z|3Ct0QN4>s4Ap!+pPiyMQ*)bUZ!9FkHQ1+$@hTNpcu$?q*mT!apDbqNie(wblwDz~
zlbarUsj{OKfQ1+GWFuoU@QiqGs+UI}dy}|B5-xD<&SOh5yZArXa+TMr500z?3@inq
z)K6kudmdpFuB}g)e9z9-XzB1%xtX3m_UVx9<+7@i{-=~}!Cles*Fe@SR>OF3)9%K5
zt{{#gLU39EW|dLtNC}XK(o<?Yxo=1OE8)1NarN1khP9eW-eHbtrkhTl;&wKsvsFfA
zn_tl9zz?ckE^8_!3AX@P8r%Hxmua0{O3+gdV$Tsqo8D2m77s}(3Fiv4!9O`PVpX8M
zfphI}ZU#w&;!1?T4Gm*5quzq)YBI9~{xts9LADL|YaXU_X=Y_gjXB%^<Ws8j_2UKv
zJ_=5{>3)(D_6NsU*DjadUerATXLy}m{gUvlqde_7E%=$$r#}+3jeNu>pSzFAPJbJ{
zG1Xo%S`F>TB{dA=x8up!Js@6;*v@Zd8T9T?RV?fy`&0a;A*KK`kFA)gs;jS)xF7#*
z4JSxaQ?ydd6Jx;LQ>^z|2A`XXP~>3@2~jv6Dvn@^K2YkqapQeyisXNU$OgUG{Rj91
zb7ZbcoNRLA*%Azcn*}^rH=e-!Yk2ctzks!}Kdil_d&!K9B6-iKoNdHZor+4Ajh0X1
z5f*q)1t>Y_EHp{8l%&Ptgj)FM+GncTlTX>7fRi-YD??m%>R+mps1pT0w=8#01q<o1
zSYKv6KUd{*o|pHeK`R{O^vUfJ3@zv0dIkse{Vh)L>8o@$ITVnLY0{E<YsnXAsX<#S
zw;XHI6qDcYMX&pSA-MZ;jDE|h2=wxIDVZwRTivYKBr$XAH3=3=s<iT7pQ9dH``!mC
z>f0xNkIyZb7=qn99NYQM1^jwaZT9$&EHpkciK0+ZWD9>Zg#!mFAYKzD%PS{2yeQxA
zuFwE6_tgLVzQ7v>o4#0Rs-!$={;HwQcro!5GxUU;I%0k+E_Oqg{nmHYYMT2_D4Zi|
zzho_=@8604>-2A*oGv#A@YDn#aX}!1*DR0Cn@hX}Q>ut7OsgnLgwjLwZ`}I29&_^4
zFBGNRe-cxgfNLj8=S7J>v2dkk_H6jw@`&8xDtmCv=zZ*38ZWl?dl4fnQ?+mlwAZfL
zki$~=;>^?bGkgc7UR+oRS)g;VJkrt8wAd6;w-M`D6F(;yY-ctSlh&N>Y(;T)aq*@L
z>CsgryRjK9<uE2V^K6wrx15(A*5$ynxZ?c-n&Dn?bf7i+L?d>d(8i4pluNZaX6c=9
z{B(NZb|vO?uz&J~SY(=PXCQdNN=@($R}NGuI^1{?`?-|<Rk2M2_1o2VSYA2$K9W^W
zJSmV6>%J~~%e+O5LRz`Wdk#64PnjRA20876C3_C+I!Y{^&7~S8qYm-;TejAM9{|2L
zMQOs%96q0`6femfwpyv`6w`XsN}it(s<&jGN#W^oq6S6pdpwwZw*qecH0&@PxLG7x
zvqe<FH$$0LPe4`}QvLFbyo=;2)wb|TZ?@PGF(&i7NY;Vp4jz@G=R%HdImqZ@J6RB-
zP5g)zep0^dWO(Db{38}Gl`$g;74DWTbMjIlUF-g{U`wsOA9UBH=H8KXB|PnlFIS$C
zba{E^@}AW_Ojl<f`@}fD+3*w<JB7%Mk`$QqBkbhFDM<2|f7jt^uo`lTXioX-MqJp@
z`$&`U6QM(d$VruDxj_9(z270jI<A~)WYwIKtla8aI|8+z)S6I4aCTb!@=d&S>hyPp
zZBGJ|lq9pq+|mZegDeK-HQ74@Tr!O`KWe0!0Cw(xe(j;KLwOnx$Q<;w>YF<8if1PQ
zVwXRAx8k1(QbcW7kZ7vfSzsUiqYERwM>YO!U+_)LGmQ%9cg~6E2?v^2a#9tkieBVn
z57BbPo%+KCOw~g#5{5P1&J5YJ^?2TrYu#hghjLzU&`CY-OPalJVOrf~V(@+DL4XD2
zXBSD4PD)+>N2@FU1+yIJ-Tu_I_YqFhS9>&%$N5CNP`h+9ouHHBQts^EJ6^nRDpV}B
z(k!a~<~_MPvf3zL9jadWk`@0XmI8ex7kKS;?P+AE{YqeRxB&Zf^V!=PH_aT)xb*>(
zXFnOMV4WVn8UHvAm5od1|6G2K<t5d}(<2Vw_&d<n=;HjHhtl||viIt~Ok$fy|BB(m
zs<}M<_5%dhIPnDuKHYiW*;FUP^T0jAzwn%frYVY40VgS25BHZ@(McQD4Si3Pd@Cl4
zG;^CCx>!Hs7%TkBuT`6;Q#d6$8~Rs{-r$2<wco=sv8SvaJUQ>SDevypWyjs0jCc1w
zrCcYb`mIW;A^ePIw0HsP-e2!PZ#0rPC1|+BGxq4C)}wcupKPACEWIlGBR;22rqvdG
zL1)a;QW5rZ>tVN3d!l~O^|+Vq4%dV4FZK$a+P^Rk7W7H+xHslW)_Y3-u0cz^z$K=E
zdtzfNzLBR~aeGL9=h0A{RkFdeB9cGs)6V>~bv5|+o>kgBR$uC`1E*YX%SuK&e3qKX
zR-|2ajS(I|t8?%!nb|cuwqnI@Xs-24X*M1|&EFDS<pkmD3n#v4O0;YGOxd5rFha-8
z^^eIbpzvW-7WjJ={vX|7UA6M8#F3nGEu$rm`QHudywZK;<%)-@eh_<2b)ppUq(DZH
zWtDc-gTXy2xmJCh45L7;Up^fFIg->_RERXlnYgv^gb|NVEvaTYy={K!*pa{+-qR_Y
zX#s&oKKy0#Xebu%$8#GoU%ayz!wC<%8ILZo9^a<ZiS)P-S*_mW+jkEfL59VR43tF7
z-yEZP>3v~arJ3?FC{FY9jK4K`r)|lfQf9hdy~EDSe`IMC5i0Jl=Pcd&H-F>QkD%+u
z>1w&!#5598;R!8t4IuhUmO>SY3X}40XQ<p3E{A9aqJ7+{l+fNVYXb!LhBnfj&TL=f
ziPzWv1dVo@1P8USM&mP9Vrb;yDtxt9s6V(m7Q1Q=VTi(z&%G#6-#|@$+x7lWB#;fS
zMDv>z(cMc?>T-JOu$S_4gJXX!QWU1XsSRk($Cbh+zB~2h9)3<$7mjKU*LQjU%R8Uw
z>$EaiL`sml8h-Yt>IYXd%Nw+=W=j_VCkO&`t9}{*Y2<;x%5Ty8`{RVdOGYmI@t!%E
zBhC_Dk@3z9VTJ7tge`>}^jwu}f$GDGWZ^AZ#B~HsiAPLTi`3~ATxZ{U{S7s^db)gQ
z5;*85(0?EINCdbW-Tp~q5F79u3FsOADUQuu{F$iN?ep>q|H%iszZFB<kGhj548w+V
z-)Ry)M@iBee2od)VWMm-awk*y^tB=fZdGWpOG#5fy6mR-N=SDJmUT<|?24e$Jh$A1
z+U)1d4>~fJ{)#$>4!lgd<v_7ekDmcbToqk4o9}*1aD3z$ku%tt(r#hkBUqy;Zsbmu
zC4dXMIb)EV_l7jB#e~zgCB$H?F~a@HR=8YVUs8<5gIc)Q@vE#x?wGIrW2sN%4Y9(W
zX9|>3tTzKDU&PJ}^dYWSCOrA2cy8p`EyR#Lls=BMyx`!4JemJyr@&3*mfEZHUnAjs
z5kir!!Kdm&n~Z*@W~yJr4%w3N?ZJtWJRmu`TI=*+ak4(@Ks-b6CB7)nU0L3tm{ycK
z?_+$~^@vesA`RSEgdZu$b#<urhrg_wHq_pyb;6GLR$fAtNQ9n|(ogP35q-zUp#|oX
z@I#({BH$9d`-u`Y{wI=dLG3#EuPqNDt8C&Uy?c5+jBBp66FrtEmnK21bC2`7;N0)T
z05{b;g1mk?W<S~IqbA0$#q_v|ZK`y%pvle8$Xs5PJ^NBcW^xMEijY``Ci8;^DLnC1
zsDrjU=s0A=@#SgRSlZdkFF%5}JL-l%i{WqV_>xrYP1ApG&58;~oj^MozhsgoY2Lai
z-}RgcI7m`^OH*dqzLq5!A#Oj&;575zYWbm7vB7CnlKb3@?Rz&)u6ayeG~nkw%3krp
z^zn7W;Z&k?xlsbE5y6a^ui%~YkjA3hC&`!}8B~U)IMbi;WLV8>^HWs9WAlLO`-v5h
zw0;(UC3Y5n$rHNq2LryjyJn?%W>|tFL?msg08xE;+ge20lHgIx@EL=j>JI)AX`}i5
z@a--@A5JM7djIO(7k)G8t~cM^w<zJ6ckfzl<5HrN%ztH6B!8k3%^FT!DLx+B39WZY
zlC8IJ(*CCUz&yTfX}535CnMnUroB+(;Y^$MfF)<|MNF=DfEI-Iw3ReEM3}db6tN(f
zMitzZuvldNp(NdlL`SlNUKoFh3l~4}D*e5QN4wi2dhUGwyOB>b5e^aqff`<qAJu0w
zyM6=5XC0Kvn9EbLzx=+QhAyih)t+eaoPS%4CrZ07V|rr(hktlX-bbQfE|nqsbp$uS
zkt%37qNvC1(;^3oJo?CA(jsC@;Q<n49_3eTS`J{Cqr;z7Up5Sa-4`WhEDmx+3t-Ex
z=rW0>-6{F$dWZ-IGS4FQWAE#Uq?K3^e&B*I_C>rvxAL@XyG1pE>t1XP*gsj0EQOA2
zhAWVXuW4J^Nqp&4ks;*X3^I$~hEYs?Q#_+Hf6!TuZAn|2ev`_}z;RRd8D~h4ryX<U
z)F(#h!G6OzENflaRgaRQe#=BkekNaDz4Zm_&&hlmq-Aj*{d?f9R>=>DHU@Qple5GX
zIL-rYb99fX_-^jY#qYC?e-WqXB0pDqj>^(|u3-(Us^G(hJOm?ou1)R817H45iJ<MH
zIeoxsPo4OWzJr4=JYV5Pgou6c^9LCUi%sSATLFmQpVi0oX~LBsHy!;qYh)9u@y7UT
zH8SN<_SR0~?=LnlzuoQotB711q#|Q9j$ygY))EeUd5@LO&et>F8)!uq_-D_>jt6$F
z`e~qM#`cW7NyuS=>>{$3Cn+@F6aS`)^nj~y@K70^_`pMd5-D5O8y@v^4hozrO$|oy
zC5M2Fo^g2VUunNJ4_*3p^_h83_CcbGj%4`VgFBs$gz)aO;@C8?Myup~m8aKA1R+-+
zTNIt9CFoGsAAWa_*peN;nu#uWhhcDyGh}ebo*Ev{v~#M|FdRR+YH+SRgI1yu{eE4V
zUGauP4|kO9`y_|-am*5Lr>t@vsi>=&_a&zb4701H*eodt#p3AE)72hVDwhH|jS(r)
z>pX;<Sy#xFmn-d>-dz2G$=xtCKay_%=R{t{M6~rl^;K=q!zsM{X^<o+31-r9KWlt_
z`jT<RdUfQNcNR@P*jRUY&3yVN^WTKvrWpDwQsxCbVK~HrWR}o4$G`r2@dcf11M`Vv
z+XRSI?nPL9h$On1YRXw@`S2p)vL3Gkzm`k!?^*KAxC6!qo$Lj<h%MP7v?As@i^fGS
z*F?CW*}&G!0ZJSG{Yl(N0%}{tOi!mBC=#PTxb`Pzm(OJ*ddR<tPzrL9W9)73F~J*}
zql;RWz3a`Zjow5s5=TX*rCK6npR^W3v+U>Sw<<_qeeiS0?Z6p1#G7xqzPh&QmfawW
zRFEeR?-n3VDxWqTXWdJ9^R$BvF4UT}n@5v0S|!M7OR<&s`WC6<Z99H!5z&|Xyi!CU
z^}{C`=XD%>6{>nYcX26_qVhF5lwa^8CT7dFE-R+KQx`j<x$}0Ow~D`zhJa*x^Facm
z@DFIKjaLaYDcru{HZ_ENjyfO?LD<IDQjB7Bc5)XsU?5^PT|@Q2lQXckXZ9pNV|D$t
z3HP!m9}5t!Wz)=?R0#cb7iHgmx`Kd>N*mv#$!xJcxz1U7+TY*(Y_WP3mx5tR=uHRJ
zed+j|O|?{YxEJD8xbm@z?TW%&QjtpqyL%dL0?s>c9~Wu~S=1aIQ&~Im4)_-f?%+QW
zs#{N0`8Z(pHm@n9)WhNnG+l)(DvKg5DuCF}fUcuO5W7ebA{ixm*sOQ>tQ>E#2=9AH
z+T#Xj_rbS?yWzc%unDA5H7R4C6hk1z9|DVefzF_|LcB%ZO0s5Y^@sFW=!s7V-B;xa
z5yl(immKp8mMD|9rKNS;{k!a^4P_B*#$c~g*Hw~du-vJl`8^jFM#fzgJY}=L&(Lfq
z?K{z1rU!SQlKS4Vje_pC)%oU<r(Cx_Q2z0F3V!D_44?gC62Kwd!8&=ilu*8x7IW?{
z-T&9|Ajm|R*nxhQwKsVcr@!a#fN~g`Jy#96nb<)QEJJeJ*{r_7=!>C*vy%&cq`2Ef
zq(uswflK}9dn1)63m?&J5NnM4=qTvrL$PAK#i!K5S6^&>+8L|BEVCRC<e~qy6apy4
z<U6l|zQ(6Cz=pGisWoeI6vn)dRe^MlHjN)MXA2F~1fwk%7`a2F_e1R_RTl7Kh@e~;
zmEB3%!&4$4=}y`<NkSCBDvHQFjizKF_eVhqW5COw2s8S3(b6ng>RlWHi!6A)09sH3
zV>xGMZt*tm-c2IA*RV0lQ%lr_2e2SJvE;FaeK{Y^I=7Uar&x1=(75NyYoc0`Tlu_~
zpR2|)wGYiq?ts@~K^Ky;|H+zqB<Z~3Dq37@rU3%6G{8emB&g4Yy+d*dXAvQI<MnI-
zh+oGLEI$Owxsd`AR8vHZVrFa0IP366g&g`b#fIT~V@<B#qFc71_`v5+@0iGLxLfI`
zK35CcS0PXa0af-enJsV&^vJI}OW+HsW}@Za1IjQ6h7>SU2;9XQN_q^ag}yzO-SFl`
zepqJ{U*b6Z74q$O>#QdUFeLXdFQ^Q+MB3%<V_gFh_n21rfC&9JG@&AD?qh=FG_^(T
z<7m7#KCJXV^*nX$so|5SRXrnMmP3jT&w=BXakf5K8YB5%Q<d518GZm<qByZST~{`v
zZBH7yM;1*uwz^Gs;$ef6lq)O}T505lxfC=3L3=>{5mIE*^LtK0oP3D`#R+P@1`_z1
zF;Xb=q(?G0|1f3Vt8%pv3^|6NZHD9{F?lu+8Di?{qWVR@eFvCNh%xSG`-@A$yS{@t
z#Pwj~m{2$pX76!jjv0B3e||@WrMQ^=p}T8>cTyk1?`ET6?JBMsmBqorQU(H;4gpA2
zd_OtF>Pvb~fzt{X0O+u-7#$i9Qw^H1JaNb*U^az?l;6eFED3m{LpLtes^sO-Od#yN
zkf1yMbXRGz$-ERBJVdgluZvO!)<YGPFfs8*?t}<{f^Y!4Q0*LKT_M3k{gXMXE&7~R
z7PJg#KagNV&$aJ1)jz{lIo2fX%v8*_KJM{_2-Vo!9HDu{T=C@zeXaEuWJ(eD{m9}8
zR5aLYJn{-MYYHlBcoEnZHT*zA$8cUA_1C>d9v6=e@~(u3dvmqCab^Ol;@N;1vt7V-
zsBrq$Q4|Nd5{8js2mvKZG1^T&Un7l;a$JQSt91AclN^hEEY21kLKMY|TV90W+*?$J
zBRcbreK5DbcySt;;bra}5TugAmw&f8XZzzSX=<+<;l=nUsQ$G-K~z~OlwIf5>$>}o
zvcl)V%OE6hkeEbpN|az<K0d38byLM2LVPRQl#|X?K9e2vInx?IE%G<X8ZSp=u})C!
z#E(O_l@b8S1BxlaxUQo^oy(s;Bu_B_Ry;4I)NtK-dgCcE>7-QH;|{r&vaOk0ypP~q
z1E*)n$~AC|=K1Aw4a&)2sr04oz0sUvpmL55@S0TT)Z{6?yqfpaMD^0yq525OJ<!Qf
zOtkOTw}}BA5fX-isdya8Y|ti2dCwuh_E@4T5@2xfnIA@|Y0wS6Sj?CnCDv(eXh%+U
z(Km8PbMrX=9u%W1G&slTUABziQm1mc^l_?D+eXzm6NegY=*Y77aJzMl%YbAjTSI&9
zde(OZ_bEBEt=7w&8)<Nq{cn3=;>u31KdKScHmSRS(gVO-zBG9km5dw;F1!^9%}EE`
zC>=FYm%~Az?$8E2eSOq|(lPDq!h<`0IvDS-RTWI?RgI^L=r5|r_a6TGc)4M5LJ)@E
zMxdN{j@oD*)@I!wJDLmyd?N%H@~HAy6-IBr1;lPdKLP(G<`~Ty+H;DSAh14g+lDOO
z4&NXNfZ0F=_%z<{`d~g5jko*`D_13?ob}ouHcw`+-m^0tiUj)}2v_@KBx9kB={ibV
z7R*>INIA)VK$-L2gEi2_tl0iL(qmvCXV$Ok>}<{KP9QTZ!plTBY%pdfPXb28iLw82
z)7-%pb-VflWcf>ic2C5z5QQ%dmjPrMhz=j27;!g<sqfFi^`8cZ=bz@+sh!vakXAq2
zb;g5tpd@Vsc93J36Y{713k#h5ih15xlsp<qNbWgGKhax7mu^r>C<DM#|I-;n+C(ye
z*yDMG=TK0=iCY1H;bHYP{q}56=YV|9X|%`9apf=!V}4aO892Q^3ez}3dM|JL@4l9V
zTK0wZV?DAspcEMG!)SS9<lkjgd{Kw^0u3DDqmJfvj5+?}G4~&7#n+L*?*YNpzWj~O
zn$%Ad#8qJXLS*M?=hb-yGyjNIr0nsg1Olo>SU3U|GB4i?sh%O!WA&MCqxhn79$1>S
z_$qpBbR1{r`-P<!nJ~uF`iEQ#jsdsjKE(ClR5va&$e`B1BFCifojv}?Ui<}(WSIB=
z3yA1W?0lW7$7{OW>K2F)P@}s>`$?6Q*BOoRqn0lMZjMF%xA7XL+f-SBYA0+7|E^g8
zLBw5bgTUn+OOPrxwiv@|+?{+XvUzT9gJE*5Mu8M)cG+$cNy1yO49M?8$Uo|nN8SF?
z4ryH_B$sj=3eK2)`DZrQp0cSh7J1gPHj$#v_A*GOy|S@p&3b2h?hz3)o<UolL^0tC
zapQ6ClKE{OfH9mX*W~%3!ZaD9%m<eR?htb?xA89I3i)|Ho3xSNdaMlwI*gLhLz2Rf
zNhD>2mA37WWjw<5`O#=&7=`M{_aj9TH~w5g8{Ei6_Z?Fzu6m-S(_o#TO>@S3*;VN3
z8J5itXH(}MN6r#Pe_;>UJ>DYiaW){f97TDcvo;fbbPQ&LHOrhpa~J1bg&m0*>b$pL
zcLM3VgnoUq{rCqYiV&sZ|JoqRQ)%W>So*aeRj02SSfl^b!8v0t!}MBWKM1J4&f57O
zgVWsMDs)|eQ7Ohd0=G(^<d(3RE8xN)F(YuJrbmY-HW?gUa~%{j{ZN7fu+emed6+`s
zr@G{hUZx#Xy7sP13xePS2>bcpuS{&Rs&;JMi1ioofn%mj6Fyl7V;pSmvyvU7S*>5&
zpNn(@n{zop&<6JFG@!TT@lj>MByFDeqwQ00)iyu+ik7+Ft>~=W1!F?dI;zCX=OaI)
zUEI@^$BxL*Y;XT5KdQkb{2xi@9nIGJ`0)&TuTpzcwQAKCrS@vkqGrvaR;g7asx?Z@
z)+TN3R*Tv*Y3;4}po)Z2d&de%e)oHRC;#0OCpmeZ``r6n@7McvNP4^*n$S6^>oO86
zIYd(Rn0%3QVLPl>I8o5iws#?@8yCEE_DcYty3y}sy}*6>hHcKxW>o{jZ+M*Pdotwz
zfkYNQ9C-{O4;LAQUB1UroUC7>z3$sEpoQn-G@F(jO-ShlFCbEeX2$K~sm7SKNW2+%
zJm~wU|8AW&M)(#I-wplu-lP!3ScyJG4Ukfb+`l<#NgoT!Zcy@P$3D7eS_&xA(0vOL
zPOWn8+gnScSNn2JOY6Lkd7WaT(N3Gb7yN7kvBH~;MI2BUKdsp#m;iy_8`RVS@S4l*
zlb_XMi6$;4bZDWp7$kE?`+Sr<F=}k7TGGWvY=m)8W40p`BZb;WNV+6&tm+B9Y=3Y(
zQMyiv_A(x3Im7teO4!M0qlPiSy*e<16az(B#FB!wv%*DkgNDauZSTxu(!`GWwPUqv
zhhOaj_mT)%OnV;jgLdBpK82hhLyu^>czcMdBm$+|uy_=62{-}iyVlJ3)WqsPo(T%e
zqo4S@HEVG5(>A9y4;KnBkwY~y>_;Zaoz@fcXZh8xnBU|z63##7yNg7Xr0(epjqMl-
zROs%pTz_0MVJd+`qw0_xuj`bn&?{*&SvU1kxf8X9kyCVKK3!46piZOQc4CSI-3Cos
z50~fH1<yQG&i<IYINW*=HFa#J6_C6ieEOrYD^81l@Hpv!WGUv!rVRh3xF~Lg)|`Z$
zuN?cG{xtmsLMjMPw_h~I^ZWJ(ex(!cy1Mc`r(9BYmEvK6tg}Xs&sUV4(aYrokCp5>
zr)!kJ-;j4E>Y-`9GYK~@8qHT;UWQ#L;gkna47}+k5)@H+r1>`)6XHr2ywUVAkDi$o
z)*Y+@d3&a-1_UChA-QH19`}{bF9T!p>0tPZ^3Ne1iF55N%Js-oh*U9mKl^y$Fo(lA
z3&Zpd*=S6lszmoWC>0EwgX*m}->iBgV*%>1Ypg1sxzb3W0qok$!+;|ewyOC|ca
zBqoPKotxo0aQCyK^RprNE*{GM3i3}Fl}CCwH6$t*O}HeSBv{fk<2ID10w|u~lsD4K
z*e*veIetg0K5OuVnEQV~33l=br@@^^;a@2{F?ZooydIFQnkAIBSiTn=3X6r2Z%Ur0
zsS#O!z=_J|q}`)!ln^^J8&wo9VP@Jxx)+l|8i2hyRzC3(5O3>>C8jSUm)E&*6&omR
z3<pRL#P;5&*?hQRXn5%<_NBZ-lbDAgq56EK`_6sh3lTF`y9YXoEHy8bG05{>nUXEC
z^3I>RWmhXX{V^5SOllj$oJlrC57TYbzUz~6@7*MoJ#>CQ5sj}|hl^Y_$3;3fMhK8t
zQHwpFD*;&x?;&;B&gtlR9bx$M2|S7P28ptO-zcydNUHZeP|*K1`wOmlP6wNj`Bb9S
z^@bEVBn7NTzUTZy#5={L+ZSece|TX;P#+63YL>lHh%WM^6T*xS%qS+BIEzbr&J(Rf
z0QD%v-$=XKzJM{qek=cSjtdmiim!mb47%P&yq2wMp@JM1Nk0@z+mh5NRtXY(D0mGX
zZ#xU4n=khsWPXGswN~hVgOeKW4yroGIBg>4^W*Tx*{;U!`Z>dytIDVaf?Ki#1wHe4
z*MSB~$n|x-F40j=Z-Qh@40ETw^RMW4_|KYBc)m0|`X7hPR?K8+jp3Dg<u0P-fF!wr
z%t+3?G_6bd<!pMd1k6Ynt<_hooPrCJ++?~$_fGy(UPcK^{B*D+vtd0@P9m01FoxbN
zK4!Y&*a)CHul6|oa}JY~C@#Ec9j@uOkV?1>J$+&S=G6BB^N;8pMChi*aj>p7Ta~E{
z5n$^!QPMHxi7=D<NyG~p{K;c0zRl*9{{I$s8^+gcCXu&>r{76yAV0upKX(6q%>V?@
z>vbmIQNO+{3<|Y+`Gs5x2=?FnGEF|7JFihBTe=C55-vt>6oo@McZ2x)U#%~u&7*|?
zDF%pEJHb(nlIP#ENXO*#%e!a|`xH+J?a+r_&f+|#S7cE!uf3P<;X6}pD>|}?GPR)U
z1A>gfruoKx6g`O>tx*f9PA9qif+Iy1?!u`?pkJc8OrU`xQXOrN0Z&i(tf=e!8BX#7
z2?|Ej)KW;*_Wpe{+SRTmwAD5)g>)97{`L<(!0E=}+zp>CV;Q=nCn-mB9T8R4n7-|_
z408sQa2GSis=DNfnBI?hN;0pJwBiI3;-A^<6sl_Zon~FVkYcq!VDC%JOG3&e{HRxc
zTuQKIL|;L@vZ)`?VNAmG2z3L4b>E(CsfjRr^OfUTC(&5EO(;BY?R0fUr%l0<(U1vp
zw0;F^9wm+(P>D_bd+fHIqd7y(&kAE8eOb^^z*eyR=q^wofNvSBsfue2s2&E}cEhWu
zm1C~~tVRLo8Q&_RTc!Hntq6KEKpHVnV@o!VLp!GB4Xg1xi$lS;pbKC=2sfIgJ;jo*
z{tDEyGxc>pR&x9s<Sh0DnyZ$q;kR&G_v0JL&-x$hEXYQ{05^eFtCgadX!P!+e91lU
z?ig{w9?0LK3r?L2Z!%#K3wxus-AJ)O2{;lIL4xfSO9~%U+szaD&_aN&L*OcbZU9g9
zN_h&v1E74+upSWIT-i$gXj9G99Y>eAD-5W7E-0keygd!%Ukj$L9Uz2BJpnq3Ajnvm
zgBaJW=3g+N?;g;3{PyOl+jPwluF}GwNe=*mg`<Uh<a-~MC)eVWMREZUzg!?^&;7tx
z7uP%-p`{FzA|HTSpw*D6Nno?4@bgscVLLC;24R5Kb$RdCR@Kmkg1#AI?20EE0MFhT
z+IuV+RQ`BYEdQX#SwNcyQ4d_Q9cL>vHm_3G6*aENUXot?k9u*GAy13!$O>|H#*!S5
z0^U$;TtDy8U7cO&-$ecaL@eyJbf!?3^A$XCWqXYJIw1}50{uONs_<9rA4p)_l^TBE
zl3eP70$p_vbg8_JY)5OVVHLEKndZdEn%<G>?(<u|=pfrW{&oA0h((#-bCvrD2q4lR
zy8>xY?oB|SN$W_;>u>Cy8`+(CLz{tMstFKZL|es~Q?`$JQ4W4RT-OE0oU8-H2i|V@
zuva~C^DuYg-CHgKLuXp70ou>y+LPlo{||F)(9F)H>pPGzK-XY}<*OB<!I90aS<qk!
zDip#l*TQKuDEaQa_%lK|d%l%lzGVJ%_W#WvynvTP)Gyl{VY8V|U*r2=?|;IK0TQPt
z*kJPcy7fv$5K%eS9{2|kClG^$=F>Sj(zw6xzrAFR`^!T(13oMmtMy0Z>#)yjh{m+h
z^V#Kf=2}C}4%dP``79s)z6*Z9$s&xE;APtW9>{ktRP!ku&P((HenVO;Cp{>)Q**73
zk%QIrADj|!j!<fpL2@&xKbq`l(#4Jj;dOmTIp|>3(p#Bs!rwMm3D3{-DEFYU2!4b*
zu8XFLI^b~5G!zPajdirJTHw@VSqM6dlCxGdW@2!eBzM3QPE{T3Zw4-u=L&g$HVRBA
zeIWXOw%%g?N0jT}$Mc{N$`*PJa;JL<?NlH7@2F2%8?YV(99?D}{V8iw`&(Dyy+Q(j
zkzimJs2*HzF^Q$LvbD!mB(O3X1H%q@Uc@y-<n%_IZk30^2~TkSr#Ov9z*yn<y?s<X
z60PwfwML9j2}!>9zdCj>P{%HI%aI{Gzth=I?7>!D2&5Wf0px7=H||h8y9&*lKYp`L
z>Kqqp>_0>6_L279jY<8*tA<9O^h2u{0%-wP%m;{VMx3!shGOr#1s9EMf8dn20w^2m
zxJm|a_+U$$sMYZn4hHIi#(?|*Nn@g?OVI}BvsK4M>RgCFbR76BFY6rhm6}}Zf^x0R
zxBNLifJZ|Ci?kM?zpciGzA21PbS$`3!hHj(<KYn5C(!~>Zh4m?_4&V4p#3Rwz#=k$
zQV+p6o-38=Dq^MPsd4J$N16vLEyKZLauZ%okMsIvWFl@>e!_T@_BT7Ff_CnrwMY@C
zQD+Ds`P%G3LjFRqjkG&IW3w5_ocXOn@O59d;#2fFB*@fkG~N1ZB(q-+obkm_4Af#X
zmLder9+C6rMK~nJvr01t<n)XL6jYG!dZRBW#g!)lj%G9T>mT%4NB6lec_p2bif6{3
zVF3>Z`&4lhSN}L0Z~A^LWr_gH=25{K(XsZkT?L#sT!>*pO8AKl)Q;)xOwH({XGfPz
z=gjAPwpq9BD{3DOX1tWS&kLJEI|8~M=s{waE0CDMhXod0X_3c}6;J&2H{={!I}vWu
z$Vpgq)7_WG3LcP!gPztB{dM7<Rc*4rZ|xUlTjX9vfPrLua=nT>>Ou1D!%XG96d$@4
zs74vNv-{Q5`gKOl=Mg$ePBDe2F=QH^SDx3>6Fo*%KbZaZ-1ZOpx-GzJSyXj8GIabP
z#v#JD<oLtx2mdU*T-Lz8KxZyEy)*iAlqlXb_Fr1a;FtgDCYWnS(1PyGtsl<{v;Ir-
zxe9~e0Gp1+hy69sw5OTM#|_@WQ8LzZIMPNr_WE}>e$H(<r|~1UUzeH(iG?QA;0k^A
zSh2;Nl6(kYs<?Z$CPPp%toP0NxRI3Ne!77&!xvG`xYidHe7H>;6229Q<fwj!{Kl^?
zWffVl)9hQv2c)bcaqX*bO|&$@dxr#7dhML$fWBo3U_aW9U3?8Yt2|`kdr|jqKNq}M
zCH69ZMGFS;|Mi<9{d>WO!N|#9&(9;MG)gYD#BHAIazSu&uPeGeV=2)lY9QXc7i&+^
ziqI`q&rgzg>4>9+Stsf;VSxhLo43Yezpqx%q88<Tr#j?jF=td%m(uxMA)$QtV)!Qz
zV)9U_a=D$sYsv44b#u#d(T(Mb_ZwOZwIm-?D&ccDyE<3p2)TKAh?88w!!UD&okPrP
zX9idk38dcx&-n$2x2tq_&g<zk(noqVlt<?UbUFV74t$}%2`+@p0Aehti`h_);Hcq%
zpo4?~Je*vK0}cb0=rDvHP?D^jf2*(^d+GS_JLvterYGj%MwR}gG)9Qee6=eU;43m=
zX?dN<EG88!Q-{t`9px68>hWc6*Y>B73>6}0uH$R8eCz9v<AiQ$HQ%NavzYq~3u_x`
z(Vc-%sUi(>K7~`mC(63`!_YS*g*1oao1`2&%{iAQbgxt0lTY{byZC`4H#8YJ|FPj5
z1<q#rdBl%eAJXqyB8mqD&t5$%2&NuJ^JYIASENdzUN%_K($9RtkjG$8=^yMAwRxAD
zAbVXStT%+r!DPgJi&+`<3FCp;2ZbT%*(VB5X!EeQh`i67Q2p?AkcQ_uTPA6SvFIL|
z#T-+u$5q(!B`Yj~C3wyJg}xN_cg9~Dm{rIJzVEmRteYHt_zjM1o&#)vhxQCY92fI%
zEj^ZCP(X8#{-1Rk;8qaEY6-uNPpmZ(w6*vRP4zhsV%QNrs*z5Js+RR8ZyC=)R$YmO
zAe}*sA`#nkhCei&O2G?O7xYh}9gYHDm+}9(?S+sMTPnF?@V$cT?0Y>-O}xgo%sC0a
zy%#4A0E7?6dFHLmUDG6VG=Dk&VYv1LnidO}J8AC0PQ6+xth`&8C|r*atOa+t96YXS
zzSqa7$Ky_^c1Zcitk~_~oHsD1D_BJ0L%a$bXZ`nnhHk*LcEG6B@$=s3%Iv#*#d6A^
zo@X%^c9c#K(g#Rx#>Y|JE|e%tb>$%Ts2<b-B65e1&AXX6#J_`FAK3T{RX<$<?SU{*
z*sT*UaDc8#fZs2hXT6RWz0Cjv0hf!I5{!i07sz<kU4es2p8Yr^0g%OFT6%l#KqIdl
z0UI3}@{wKxLIm)Xz_bF6#FHlzaZT@V%=_1F9tP6_Ynm*Uda9G2Gmn1=v}wBMuOfGK
zgAi7!-Ed7kGs-7Tq)?66jOa(hp(F_31ip6_rJ6xc(W4p4_JD8ay9nIh3c+fPymwty
zgt&c0lEtdGjhDBj?24sIl%aaytSz)xS%W#`Vl`557Y2v8J6Cm0ae9EaP4hrjD*!>;
z8=o)FO+^p|=xGi#iRMZh|9*#pH1|uc8*W6>#3Io!#O3r-d4y~0otXo8B?q7C6`&i_
z%|z<M1;c9B<RiCQ%7K<Z68Aw|3r5ufLFdlyHB!#iPhx>F#=<k;!LD+<iucVm{j!8p
zrD?B$xY1#z#gc?AL4;?m0Im5uq8Az!yN~$eiVptWO0isM8Iy+4Ae#U`M7&7hDR}1P
zE@ETQh~RX&)-rm%Zb#4$a=g=C-?z(GBRiGmd*3OWuOJg#A2Rx|Z||w$-Akpp@?(V2
z%;Z%~(&f{Q!7@ZTZCUfC2ZjEB8nt6X@L80HT8`UyGZ*InaT@WzVCz%-$5;1cltar>
z)*jlqhDNrnqO^9T{B2TM$aZR+p?g~^yr7)l8g+O7z%rpT?37Bec35_k9X<lL!vhm@
zX1kog9&dawTKWj(dlhEiHn4Q4{c2`)iFd&P<e!{l|Li$u%kW&8V1E-N@8cq<4cKM%
zvE(Em>G@%w<)D_U_!4@!D9~adJ(H*U#Auq;gvtHNIfp0DfFx8x$UUb|6xE&g!`rc!
z<Tyu}oCmo7;c{dCVZX8tC`wpK^`9hhtU+YMH$|hJ*1|Xe;Q9E%!C@~2u6<Pv*?!qT
zee`Fw=?~GKeQvqGar60G;sY#$)q43E4UB*ladlGD&?I6qMshg6>a31mtTIT1`QW|G
z58OsApibfb*)6?V%SydKt*E+Xc(WY24>i}*%r5!`&v(|7D0EZrH`K-Q9Fe=s1jszq
zj?C3>ViY;?w#5{;r!8dY(GXrz6YRUg+rS<rDU&-DfN&zaGdO!wCK{TaBnbn2x$o?n
zIZunnhh8&hcht^xlOLJ5#tqTimYp_@z2{<t+(&#-1kppW8Vb(5-izE3;>yOx?PwOs
zE#5LXr>)D^yV0<EGEUjkAhoBxA?I|pV~g9Cj$$20vKdTW5{^F=C<HEt^u%O$7LdFI
z1)DZ$)PmgFElGQp%L;3nZ*rgLq-sAnPQrS*f^DwWv>Ph?3=E+yg7qqrMNb)e?itJ0
zy{h#mvB&CVhZxWQLnw*ybGt^5KouA}bn!NH@Hg-Wj@jrYfzLngzDyu79K*O*E)%-I
z16BGCLXSK9NZ#HW7Oz|$P=JH;>NP6m%g&uE_WhkuE8~DpXyFq$ksfCLLdtla)fGXb
zrb{l^@@YQ|CwpJS6@&W%<)Dt4p@W&n`X^dgroSrGzWp3KHXwb^!sgtr<V4Xg>$1O=
z)@i5NkbDWSmRL*QC(_D+WX4N_Qo2nViK;F|57x_TaIDm#_ELS);4e}V*;mf#pYDtC
z`3i6|%QI_Pcuaj;_A;LdFhIRcM?|DY^uno%|LhvC@}5UJm#HmNjO0>IWEZg09(^5W
zvl$qO9!QS$^aaV+73T~!SM3?fbV+U`e6^fkxz{k;vC#a4LV)FZo3eDhNmCh(ONQ+D
zhCCfX9ZMVeum0s}CQ%~vqH(adhr>~!cX3`WFC8QgonQ(|iW0BA(;)SQa_1aofTwK@
zbJC<@0tIVqTGD7I16ao;fGgWy@g(xs4`_rj-NU~vez~PjxUK*y2T4LF)|R^^9@PX*
zJ(%EY_&u!@0GVI|-=-8}BIkO;IFX(8XfHQ6%*sQ=g#OhV9r+)}&qnHc#)ba$m3#Em
z`QMSbgH|MOal1m67&i7pI2>V>P*3V?0c=d}nX55|=1D6yfW}sE?k1jN17z2cH|)ls
zAt8G$uN?gO04DjIG4aIWZsBPB^BXUNwKrD7-fF#FTcBxGanrs-$Jsw(QrzJ{%49RQ
za(+8+5UtHf3VzCB+VWp3|78!$M2pIZCcnylfEj~RC6x%#js9jb`Fo56-;3my=?o@*
znj6A>cY=4kE6-6Q2fDnKKf_#TZ-+<~K)L#!nIVUK1YGG3K>~jd@;_K9^bW{o=H!d$
z?IZ--r86X4Wd9dUIc#^|h4Ty~FF$kyk$ZkXH(_`ep=85%Ua-W*eap<xQDVZ5voJk#
z`lE8mmq#`~gVOz=6YoY+4SUWSXiCeL^BqVXCJ#kEb(R}__9WrlZo8Y)2U<COdtfmf
zyZ)WYN}&F;cKgaD+BtA}|4C2HjcuMJTa<0n*KAa+Pe`&2w*_ZR8`}o^rVXQtKsbJ9
zAaEPGu6k$*`o{_L>Y}WUk?KhbO*e84eWE0??nNW;{?cG1X=f7K^pxb~(cvv6)u6n7
zlqE(f+Hi8AN^1Iz#=Lg$$0ZGrn*PEK#-7fLd$9+K1htUgsegma9@%>`iQgzrc$X{~
zhwxs3fto|Im+f!T+>3QEw0mKExKB64Blz@1@lcU;4o;jEe$TK9?kRH0AhOy-Q;Ux)
z;)<cm^sAC5?-Ht$^7&cd$V)9sloo0{5IiHK-tCc_xg%W;+fDh2KEn#jeRz27>2~%P
zW0RH(&(0Cb+6L{B#K#ndEAg<4{!fkyphiWN>y(OLUk}h}+xYZn^kBFQ=lL3T)!w{P
zPF1#3qEEaNtx^WLad%xz3}l8`dsyRmTo*f1;+Lx(y7q1BCkJ98?B_GF?`kteZqkv`
zzb;=_3JxPhk76i4=tFfIH<HeaV{7!y;tN~@zlhlB@)!(slk3TsojjdHJivwZ;yqmB
zc_|c#_a4uE1Ydt%>f>KX+egfG1)?)QuJ1f6{1@{obJu7tBJ2L8_)5?CoQCgTgSb7$
zL<f`NwNLTfJg3uOmVVN=MDroz--(PIF><|k%PqBw>d;oOa-%!FE^6(oB*j^`MOQIM
z6&(+~?H}Wvmyi2Nx1&*}6u17ZG}`NzrI)vqFINe_{`>Qc*EdPvSXVtIcamM4!<#6|
zsr${l5h1k$dP^4fjZWEyQKL+2q$*H~))ilpS;0_-wyuZ}>f_2flh}O6rF6Hr`r@W1
zCSZ&5$7zg#&dNFF{kM#Fv(2B!8a(Gr1*TrQzj(eDuQR3buk#wqJ(U5Eq=oA>qq<GY
zGluV(b$C;I0($5p6{i5}%G!T;r8nT_OOM#{VK;uiUqrFPa-;uXjeRXr<>rR&n4QtB
zp3llw|JZkzZ4F<O$!twMdRXnv@yE9?JMj|wi!bTcEw2FQtlNQe*pQ%P=g}uzXRJZ~
z)%nvDE4C9%CMW36jCK0}hGKanx#F7dR>$~iInRM6e?a&>c>CTx{e_#ykqIpPk3`F-
zBkEv_i`AGGPJ{-3fiBy{lhji0mZoD>XP1+pGj(rWx;Ja{+%fD%_Y0|pP4Iq-T6aMP
zFChb+Lv`->n|@CWJoVqD&+FR?4PG5&FIVVZcw64TE2WpZk)3)sQ~05z&R@8c(6@P*
zxU77gdwKB9gb$)(PDYJsVr302$lD@9=I%fKdQR}%aC^))ti1U!Z(of?VJ6cw$_Q1~
zvdgdIOjKlOYI>P%dd%NDS;}tw7cLZR>t+{WwJLs_@c3O?Zq+SHNLJ=``V!QS(FV5C
zD(x@b+-v`?**jaaCuw;PEM|O$SeN9DK7|JHutr<OY>P+HmK&Z5<r_v`8;ukZGW|Sg
z%K8@f<$c2I041gH=9lIIoZsH$YP<4vf8|1$cHOWJGi$UqRPb&3+11HJM>zzc9g+G^
ziRDb;p!=r%BRi+Mn<k_8?3O<-iysTpD8?I>^w~H1P1(k1XgX~nn#jPZV}fom_5Xdm
z)%&=K&E>28lfip&Ou-)01>ZvQn<V2`8ogFp$d1(655spx*DU_!xY7=$+w^+-hEHk4
zs(mR39{b~mEUz_hj#X%Vd3*e?Si3mwEy)u)I%9uc6l9gc^R>pHPLW2JfiES>X6}eZ
zOf_Q?lX>(-bvQqf<}OWx&b2L@Tz+mddC);~{c4-OT4RHwG=zbS?H*n?yjZVC<yMw6
zdFzu@qe`5CkVZynk*-nCoG%Lol>L-H=y(*L5;gTvuJLh?31f3K_vfd>gVXu!)Mos)
zN3{nw{t~s??9O+z815!0-;K@vBpDU$(^T=w;KsrfTM9*Hc9LWz;c`LH!a+)B@+e25
zHl5{Pxc$R;Hlh5_mL=e}2cGvd|72gzE5BPG4H{qLrq_P|@S$GLR9Ej$-ui@eoP)nZ
z;~U*od-<%Mc&5tg!;d>zC7_$Hn06I!JRL3XqyO}eOg*cdz5H`E#i!rWP71~L7ZdW2
zs`#yQ)D^KElnRP@`_w&?WMAo7PRJ=;EfuNmKfXnO?fR?#5I1i4IE;`_i|R{BPJb9K
zrvBlxr(@TIS1s20+#1DoFZPW@4w;>cxrx+(bPtrF7vZhdvJmy6KFRoN=x2$J^qDDW
zZuD#NFFtDW@G$E}G77Nn>s`Mdx0WOvsb}e~@9&%4^;H`S7}DJA4<A0wtH|$bIbA~E
zhJRKt#eDV=kZ?Dg_NkJ7z#U;mssdO32O8a<#MTPkI#8-11mB5$tH-gXLj~&jm&zS!
zd3v<!w)r81F4xIrJ-U9!OHh)!@6l7qJA*egljc|#<I{CKS>@!DZ^XYc5zN-!X759o
zt_;y=icMXJv^JB*L}a9qT!-e3g1@sfjXoDx#tbJ?iW=h$o)&D;W?ba<PD<a>(Kvm?
zU@NRL9Aob%C|c&>5iY_+)q96-l~6TNBRZK$1pg&hAbb{*%E%bVD(in|ggHROq(sy*
zFVg?v2>$jN?<rRB_&a3T`NkK?9vePLgv`y`$A0ONW=v9%I3PDH8FdMmYDw`uV6dIj
zB#cF4?XIC%<S<V^1*-CK-OHxWcUQ{4{4#YHe68)`Uf^ByqAoE|fJ@S{#F17`K1tGb
z872~#BogtF!NI5MCjG(JC!V><oUT=4GEa{wGp(t5?&TAc+~X|G&m=1Dl)kL64#^vQ
zdV@x>x=iv}q-=CK5z5?{V(u$AXk$Y8;K<(4RgKwMFlY8r)FP9YT7t{ieDqfq*9;#i
zu!opgUc}Rd>oo299hV<}^jP+ItbTf|ka+tS{e<P`O@TFcF`o)ZkD+w}yNu;wi4^~%
zn-lN4@!xXe&#J!&h`DBY{m|sj>zz7RV2#St<#;2{lhmL8a%O{NBZwkga7_M{UuvbG
z(H^+!5G8)Ubm%MjKstl0`lElqy-wV{<i9@pt^*N~Pz8|9Gl@#0@A4xJZGNrY9R?)r
zj;h4SS=VLl7}xYWLzkX~;0-*($Gl$-rpQi|d#rbsj(cxuMLeYT^Rn%&p|!$^MA8C}
z$>o1i7xx<b5lTKv-Ko3LT%hA_ng|k_4lPRBds-RcoSh_NXcPC+`m^I-QsNM`BE=(I
zqNw_ViS-*7n7<z4Ig?auh67$&>3X&Q4y?~nnbNmfzt7!YtSr?1DucK$_M1ym`?t)a
z$v<P*pJ{F;t~+zy*YcB03NHP4g<JV09<bxw6O%6Jvjgj?<`?EZ1|ujwkzD;7x$Y%v
ze+_wD_L9c5aje92Cw}WdC+2af)PyiC#f!Hikl!e<-tD&uJY%7QuFirtMRTHZ^S+pl
z8(u%dYqbkV{L*O>`S_62Rt3<sq_xaa{dtu4AVK@uL}HFo;dh2I9pATaW1pJ#d4qHn
z!k~56|2<`YMs972k^Fn(2W>&cZIHQYp@|$*k`{f>{d#XQO&7&d+=<?VV5-d>E`?u9
z939$StEATsbf1}@$!yEIAIqE@f{BaK$i|~t<I|?()`gET9yo3VeHI$_?-8$~OpN1H
zL)cCu{yexTro{p3w%AWweXpt_aFR)71ds}~Je2$7>(mTyb0t+Bd_-d%x^85y*Hi|7
zS$45``u@!;dtsm6cY!0XV#MXkZfk&L+_9}(^a}dyZwp4R(|gFZb~q0UoaS=61iK84
z2A+4i()>(S$P=S`#SuBFKxvR%7`W=4mAStb5uMP{A7tLmT(O-Rl_!0dl@d~{U0NH0
z#g~r`+@2~fH!}N1n|dsl{XSfagUXhv(6CK|oe~{2mZ1uQV%(^E5OWqIl(&$7>b3q6
z^4oK>&f^b09dMW4jGM`#bG*qZ7~Gcdto@VKMadjR7SH!<uuK6AxJ6=9E--t%%FMlU
z715W$wzj<9X_iGc9h-9+#jjEVF;lIdiF#&|#+|Dm^MMV)%8>8~<Vna~oX#y5dBL+`
z+XY82n(A$GmKd0FvqeQ9k8@?-5?L9jMtkBUqM$CsXlJq*#nq4jS_xGCFw>NC!>b>O
zaZzJD1ajFZwG8tJs&T=?0CRNFkx$h#LQ`>*ZOz0ixTz9s)Iv5X7_hDOR~7d4{S&|L
z=>bv7x_4Ko6(VCVj_gw#jWgfV?5Wqk;6q=!JoWTad*b97r`ep0NQe?ni^^!}$86_|
zbE+q)hp1Hyxfl0|3Z*=b)+GPqX!xD~sz$$a1~!c|TX3+scIwu>VK!0`rtTKp&sTx?
zU-r&V!|Qveb=5$kF)|*vr#cja*n~Rr%bx?B!#V6<)Q^Zq3(OC!(E@#R1Bu(3!RN4b
zmvX(Pm31Og$Dw&4%;hE_NGvG##MHZ&_WNrCbnGt}Hf7YLPx7CU=p?{Hz;dF9)2d9G
z@e0wZiBJA};`T9bHJOW8B(GjV&zbbqIC_-R>cn|NaY^eiaOjRpHu((Acw&r6TK&RU
z?0wrn6@tuj9hrwE4`oAcX;h$bW{9*K_g-Tjvrr*SCCzNn)XxpYuCWdKJ{eS8zxDYj
zT2-8z*Mtllgv8yONu1xmN8VzPx0m!W`7VV#%xpp!8`((Tk{*9;)X&FM;S(ef<;?bT
zJT`pjW&;JF-q<Xvgs71*g!}qDC`?!RnctnDI9Fjzk^mRgnaB$sKeBull77k-_=Ee>
z9I>1`>%)MMC>L~oxK5Jua}j*}g-hYsU?FCImhPZf8<bgLsFzhb9b>Qxd*(D7uFWsN
zkx$>zRAtk_yK8$0r`iFpPd=}?;P8{>dzQMHLEpW7Jj6t~Ib!3;WOs9ez!Uf>{Qd-J
zXZUOHUqA2%<=o`%hri&aSKHmKJxhJC`iBX|Bss5XCu2F;YiG_T;$sAuB7WS6^m?@_
z#k1t6<!r?>WzW~nQ&p`2ZPmRl+R~GMMShs9k>qMpX2kkG+ow3MFYV*jTBl$+JPhHR
zFy~_@)ALlHzXK2845DX{pl$<8AH{%=E2N|-kG((9qYh4%!F!Mw=A>8H=a21q0(0x-
zY6X32nQ*(%Kar}wX&m~zNg$mDFTsM%i4`{6Duh1ph%tgCalcZc6;Z+#CJ|T$Sy-ni
zjY9bsk?Yx%HkgyZ;-zYtQmhqK%3m~*Gi-rkQzt2}Of8>e+{gm=U+fi7`cj;m-jwUp
z`KtY|u%zYd-|81=t=eAA#+nIbZi;Vx^aF_^s=QC=IoL-xe^1Az2x}?h7hf&Ydh-~r
z!m7t*N}3iM?|({{OlGWtHcV1D1BFul^PZ2XfU@^kKL!b9#zzITBaz5~VV85#h(Hft
zZ9pAqEL?d_EYSKdN!%reW?=|4ufTGTf>Fi1-RoR6L-0qHdzZjo*pH4O!CLyA8>F)v
zFfTWuC*E0!mtofJh3CflJ!!9q)<oOPE`zIGZh}O@yZs4@M?~{>EX_Ak`gqQjayH+0
z3RjDn{Bx_N(H6(QsH`nsf5EG<cIM9Jg|rU|Z%B^*MjPsy^L$Tdx?{htkYs9g`DG5s
z^@q7NUxwMs_U#7goMs@0A22ulGZKd(*+^BDZ1xYNIlP${MBnQrA+Mu34>-das|O?Q
zs&n!i-3SElK={M!{qM6P#tVbI{h+Ni-IQqXK~lTst#^FK6=8Y$ihNI|@>tOp9eJmw
z%Gj)t7T?s93A-(#!U6l-v|UHE3;#K&V_hZx+iDsQC_V(v5c_qW0%86`9DAFwNyk`7
z&||<4%YN{Y;)5R+>a`pnb5*JNP;;poU!_mtJ1Cx2iFd`B(o?)#l6Em6r$SQ$LHPj*
zFGr$-Ri*^xV>o09{O7ZF7=p3zwQnQ4-(9)g=5F5zi`_L{CshKq?O%!go-~0`L=pH8
zTHGi3DA?4x`e<J6i@xWl$xG{N1?b|4d+{Xta&7LIbyNMDJeFDz{*#AYe6_S4<o-eV
z9`-t_zN3RTwvdU)*kABI$t-1+Ut)RFuh&!>?f#NPtJ=DQH0nZh@}cR2YfJV+B);f2
z$5d6NwEnAm!6d&5hWA7(C%cY*PVR>j4e^{*1b%=EL5~A&67Dgp?a8;bbqvsdwKWLG
z71Es3Ug;4uki*u!n@QxQvV|h+8DV;bbqdBG!Ti`pzJEURcg2@FqlAhD4az~@)RH$B
zo!KLdoR@h-+y)GL2{3}ONCOGm=V68~D0d_QS^YSga!jZBQ`*Uc-}ssvFU!D*=-iK@
ztUmJ)ZKxH$3IHS!p&gK>hAA;@(0A`vBa7$+#B#R*)(b#5z5=&+R^PK#EC<{0I&~$Q
zF5Ey~p89->4pD*<W_=uTq_4lxZhhel=r97%O_Rvs5QL0EL@VyS?bZqF0Zll0)C6~#
z{Qp3iJYbI?F<hvEBk1GDffz`3WLnZky)jBawon6*FZGki!HYSDfbK+*a8vK-+p#vl
z;eIXxy}5h-NzKo?_&BZI9!cOEF-{4pzWrBN>7T4KO1KWX1h^RsKv%L(vY3Zg_h(4i
zmtbi!xE`?zfP5afCI|Dcr0(HPp}5li#W-SQApc=8Jm*v8Cy+K`f-8wlFcRA;m2DtK
zw!PXP5Wqq{UVBT~UTo39ttcfl*fRCJFOxKs8x3Gb5ERte8feGuFUrg#`*^)pOMzS+
z$Q3JNWy@gcn>?W+4b~JJ_la<FEzqSX{)?ic*>9ah@s;Xz4&XgDF1ni*1h62z>O6-L
z^53;udLVT#TZYjrl{vD{iLCkUwtfi*2H{FAN;)75%7U|&>%oOZUq}Fs??Ey5a$^}y
z!EdKqGbD;A>x%@Ki1C+ieTL#NI2Z*AFgePyB8s<Y9Rvq`posxu(kL+ZmbQp|#?!(u
z@wI8^x0=nBC~}fj21$Z+^V8kTM`IS3tbiAUaff~i!?gA`nr%2YY916r9wRvjzo}a+
zMvgLGPUuFW5Y6a{pCCkWOtQhtf)y}%Lv%$HUbg!z(rf3x&x4K{Nn~K^nx+`aD_JZf
z_Cu+*Y6VwgPqdu+ObHmDyp1wiUqr;H->YGL9$DSS7jzcGU5=S#=uL?i1n5Zajfuba
zF8$=QQ$D}!nmKmA`a`Y-MP(8j?!t9FS;!jHd7G$tpxnUF2B6mqHfjcQ;}Jgs5}qe(
zVxi!jdl$&e7;eiF2`UrEy+rV$#33xZyE?zSsoYEtHD30jj`j3TbLqYc8|LeFNi!77
zd=>Bu)X#DUXZvIcss#ah&tsB^@gpI%RJG$aD&$>EowrTGhPG0uvfzo}E&0%B%Kw=@
zjNnqio;Xm;7yH~R0_C}1E;0SKdkO`b#ZGk-Lly%Bcn>~?R0Ir-=;?lwAL=>laD8_a
zdnG#z*XCZrS=@Va-pm-q1hO#3&Hf&+8EUuRIlW-Npr+Eg?M~N&G|e;sMi0F2pNSOX
zWisS(&9fJWGa#B1)+!`~=M`Td<wtSU=WjPObz+1#hwmRfc3cd}VVI^Odc8viaUra?
zvcDa=GM+;^$hEG=uu}+}s6T*GtJho?CV;OJ_Ug+&C#Qr7bed~&6`TBXg#99%f7+|K
zv^}lfQm#9z>P$d$U~Pbxl@Ta~_{P+Iv{FqRxFnKaQ3HrU^Dy0RRx-ov4Nz!X)#%8$
z7gqX!;an}MD%k5rKu$8Gb>M;(I1nvDp!N`8I{U$u;_weE9R?2*A+*-k>MXRn7WB#-
zFa!38l2sx%;RBT|r8e@@y6bcr+WtAeF;3Ui#(V*_`p?gJr}hOc$qg3aHWN^=aZFjO
z6eZDe87Ae-oOq%}-UUWp?|Q#!apk7)IoGC|E5LB4N!b*hbM+;0mSU(zm(eR4$js!&
z1eXZa6=$!8fsT^f#zimW2xbut+xc}B^;#t51ImNOQdHhRn%#a14Cb}?^EEQSU<huJ
z@6T#iI1#p3aq=9bqC?cJQrG!_d9E6@Jd8pa#bCqKe{VK11pK_B0)7tZ*Rn$-Lml>E
zcq`#g`Q;wZKs^f~9u`~3*5=uc=d9Jk8w{syIhz^QYB~3#oU2gELuvG@wFHz)>e9h>
z{h)>5F8kI^yhB}o_s8}9RVPkObWT%9qv)bVtfOZo51vq#QzyY_T5gI{UnOeQQb%F&
ze=|Xngrvf3pZe7X>L|z(6rOsQabkgd<+)>>{on-20;_pYRvN93>VoSZ!_9U{0I9Gf
z7j<I_lT#8Zak1~(6wtso7U{?6i|lMtB3;KLYkLU5#k>RB1zHE37df!#(8|^`5>*sP
z3xtd<d5r&RVSGuxu;Pf71dJ(^2g~gE(UPH_ZPhYdjQi2{@h;AOOFgUji{>6l9Xn-+
zXB~{lz4ZLtOdi#eMZCLV!>r-CIcPUN-aG)8slzCZ+w(x|OG(lPv(mUff)c)`S|4dt
zFteiL-wY-(ygs0POOjd4k|l8%=Y<1bKJD6?ibTqvM$=vewG$tGu}WK~e@6n&4TDGg
zSx9THzUt1&UK!^~II;nuIs!MGuOI$Qxc#;Ne@xJi38SeMp12~ZR_sN-)uVCefL|Ev
zp;>0sVgAw|1^~1yIeQ+XPGFi{E5yM+5hIWIw`qAm@bCAZrkx3wDq`%>1}<ONIx0bh
z5Ty5u)NrMUwEnxPc3o<mOVRx}6Bk;l-?Iv%k=n@KW7$$IPN)5iG~20pay}6KJOa2B
z&Oy2$Hj{hK<s@1r$ZUsf=@7has8y2mcEZVkq*P1$itapn?)L&ELoB@nCglU{@tfaQ
zW!|8MAL}WYF7R#!Ze8kr%MOP6i&1pl<A+*&qomulqmln{)(!wjTUFOm58MDg@ai|S
zi8uS5Hl6J;X+Qzu3<@M@SbT5@CWyOCeRJhCBBd5)JtO}Q@nBNc0bX*&^Xyn=Zfb9A
z+L<X1hS7lykkaw>AXDhIrc^cFg%&+x5#W{iD_7I`8Oqyq{VCWUy=;iL!oIp{cTv({
zGuhL?j#?hwTO-lUk)iD?*2DMLRM*yNYlE_`KrLjqIpqrv<NIj}P@(>_7&a9Oq7XLf
zZ_xuz%{h83q3EbyPoM8-tTvZ4_`9}7_-opb1sM(@TxD>2m)HfO%)h#Sp$gIp<nSux
zTW<1<D{EI@t@ydL>izFXEu=(iF%|jk?jtbEidP&A=D8c$-b4C;Q&L)LYD90Jus{-h
z3F__MXN%%w<v=JpiDnhOsxzN(*<6EMeXV5G;UV&oAKM0d(Og|~(AOa90<Kgbt#GX|
z8sKH{N#TG0O7;6vN)h+#U+guVF@|?92?exP8f~xoSB1keiFPufyqckVXSrXKn}6{i
z#rz|u(Kd$pq;c3`?vPXht{1*nUvk*qj;V@xB0^%1_`7r+gnE9O?xp8n0@s%bSAYhJ
zeSH=-g(9n_qIiVd-!t1#7$K4S1pJixRHQjDZ1Il&6`Ji<zk0pcW+u=5m^A%IHIIq1
z#&<e~Pf|(En-=ezzl_%M-po>QKQq6eUz=I<j(OdC=khEAnVooWo$+>i9K&VU5d1;z
zI!|uOI0hQ3kM9!2i$^Rk<*v$qeSz;fLJfq9ZDUliL9hR8)PSctFi#2<P^_n|m}H`z
zOCCpx>eZ$^!&2uUUXs8%&A<I`qxR&_HnEZXh;)JFeKg6Cek<#Rw-@T9dD!_q<L5lq
zjt}MIabrArLB}fs{A@AXQk9BEKSjA`Uz8waVM!4|l?pD=d5sgi(-JSQPEY7bsbBfz
zY?nx$cxY&8O4e)k)KH4aU`fKYrYyycw+_Lg!DQXI%d49DyitvFNj`$<em5!KAtS>?
zG~rnPB{x6A%#Z%f7+aaTU~Yo$rLz9nuMOGqu*DwOdt}g3-8K1)AlG{Jy>1fqLtSb5
z#mA|u40z!^(sc`uHui?(mC7wg+-zx)q^Ie#?a1HnZXz-1#;I#@0q@}%{E|;7<g2R`
zYMq*oWYbg2)#wpb*hRekHA0n%+V;_H{2D>mdo6J;E_zg<w)s~8-hQ2)pPu5KA7B0U
zR*AkAW+>5&8Bjf{x4tEU29Ci*)1fCDd`NgS{Gz5lpGPhyLdi1bCHK;&7Kb01zw=16
zC9qSZTb@kez-Hm)H%9lB&>_0xFc>U5O03H_`bM*L>9e-2)D!$K|D$5-sy!39lrHM~
zN9UFT;%AKBLD+e~f2O-$?X!`-JwpT>F>Jy)>+cyx8ZP+X#M6nhLlWE=H5hQCs0Gu!
zH1<;z?iM}J`|vHnpU6#{i4ugR`@gidJ&(TRFyY<`+ed|VlgzG?JP2op{NJ^*rXn!n
z@NyLV_#$H(Y5;>>ci?!Jf~Qy@mVU+@kfK3;^BF@c_qPfB8x;SQ{@(5a-Twr`FP)Up
ze~;hXU&7ua6S?5tUQR39CDB8@4}<W^ARUV>V$oSbNK|PIVOJd_)b@8R!J(|6n6!Aw
zZ`f{&N~A{xle$SsvDTDpT@>?LGcpjaf&I{f>P4VQG(`JMg=2qDeWttl7rbBlQH_~%
z+oLWE;x2)d8k^Ru9n~CHc9cb=Cz3<=718;0v%8DpdQ7+O7N#@J>b&FDDgDTnr9>zG
z{k%&eHGXMJ^7x79-8M<R5{Xn9j<7CCy@{0wZ~DXf%L!*1Idvjl5HD>Zeid-5C&VsO
zl3})0z1Mo}Qh7D>b?Qh$mvB#N_=<Z9xnz!TC(e@W%rK*ILmP)mCdpkWk8#~8$a>or
zhv`fv=DSlv%5$&QF?uLowKukIt!>hJqRw2yAHTjC`CP{~y10GPlU_A(5mrXkSB1i^
z+^$avJ9cJVKpeezVA|7xys|j@J|y{zv@^DD|8zE!ZMf4?M3nfp-V3wqe7VQQ4>J1I
zaF@7yZO>fJ`_1PeHZz;=DPg6#bR<)fTILEdmCb`bkr@m3Dmr|r3*NtxnNl~PI=-Ad
zTUfKK@4E0%q#8_%rbz<|r}e|&AsG{1`aMF+Qd90&&RJo0qrC^E(P;Lfl*wSl?}nCm
zYg>0m_xD=s%t|)y9qj`Oy+T$0ZGN*|R&A*t*n4A5E5*`$quUFuwT%lqz+C(78Sc5z
zcEWgZ)t)>eiKa<zkl_9rs=Bh?b>yb!w0qXkZm13&IhM}fS(^8H-Ru~8eqUzN9FUjL
z70^L^_M?+W_8PjbUAUcY@;@t|^*dsP)-RxfW-8TqGJ=&!uD2yj>ql6JZ*eO#ZtMxw
zOJ>S~{_YS}U9)~HN9VISWDSt<)|#-u!%0H*U#4ywUNHyu#QKLFz#+Nho)JEC*X+hs
zmX|lo2}Ob=uer)l<JC5stX{fWI;r|z$@gLzFRtZ@+qpN%=oYZ7;>p^YVeovcdhd&E
zS&!qrKBPDM?``dCazs(kMw=O<epeiHg&HtH&{bzq#y;8S5P)>O)er}L09+du07;<h
z@<y7a;;F(SsOh;e1-8@tS}x^Gtig1wm}ZNWOM=Hb!QS14&K6vJAej$Hs(AXoe|7t2
z%s(sC2+@Gd+(3iAMHrV*Jbk};W~TyJEye*kg+@D_Ot|^ad8vR>IF~JyFUWsLrRJWh
z$CqbxQSC<Y`DpptKAAYqrd)@k57QL2U>*#cq1vu9v>5p7h|?CwpdO<vzc<Q5<02QR
zz7y&+iFWhhA?e#AgX1^pC!j7oh7zsLbDoh8C-zaVeC6!O`5?47L9|;_yGAdJt9#ML
z9LyN2gcW?ztK)r$?~70=S*w6ZL446}42xBD%Y>JHG0q5Gx~#Di6Cf8&-Ttnx)MP`l
zpK2GQLn5oC=>QdjCDAJO)}g;WtjI$HKA{+-AZdt-Y3KFmmd6k6(PDo@42CVJM!?%>
zH^6uUdGz#`ngFSgTQQOZeT01A!kOz)V(-lG$RG5gc>)Ehwj^h7AsLV%rClgh`sk5N
zxs-fhfC@nK&?DY%_$<6c?kVUJ>^zc%D<R@&F9C{B|DWSDvoL_xQP7U#m8X2gr0hYj
z{1v%=IZiK>dmWZUsVV3PU!xz1r8oeEGjve!L5I;Imm!Ylw-`Vh;P&nqL-Jf`F&GA_
zQEePJ;!i-o#Heb$ytD#%7fP(^pAN1ambMQXdON2F7SBK+1jy*_9cx4WWy;orzd%z(
zIE|D7q{XvfhHO22mti~oc##hnmkPAn<@67#4-&d@^TK6ekZ`Ii(AB_?2Yat&6Z>aQ
z)aDuez+cd5Xj;g?buX`?3}Hb7{kK?>&;cZTjL(SU$6Qc_-Up)uPclLu;CYk8s!|4c
zC?c%n;VU7fS1M2(O*m8o@^7Q7jcp9PX<@+kK5%l|G_CvK8W+M13;rSCWvcL!XoQsm
z9>I*<wi2(EA^(13Y9lS-VtvDTdAKGp)Rm!7K`hq;GemU}RJSb*h%8@P_1`oVaE7Rd
zY<x56ci&Y2Yd~TsoUH7(CM&wMy*_-&+A_wslXZ|0!5ynR?J_TxIC?4ww=Ic9#4ZXz
zLBa`Ep$tqSrtrpb9p;hEw8RwAg(W;KB#z$`GPq>&Cn#X$vZ$=Am#vByf_N5rv-wL%
zst1OdOseA!rWcN1gmGakwv;g-KtOR^Xf<ShZ1?j{0MO5Hy@W^tRm+pIhtns6@b$t6
z3XpQd^JR>p@PM!RCf8qLS~`CPDh;8GW_{yBF2|>k5Tmo_d@Q~uYYQp{)d0_tZi)w-
zBG~`F5wq`_7lo;S%7Mwmja33j7Ia?LUMy{hp`-Ek@c~8tDS=Ew^x;fi-~NQ);Q1A>
z>kdXCS)uGL=tgSuUmv&XgVkqyS*MqB0R9UyV#w_4@VRvs{m&534;cpn#z+;_$!F=T
zQQ>{Ze?54g@t%&yB7v-KM`<jpK9qi$w0Vn!guo!OoG^>bh`o-FQ!%qg&2mx*=?tP5
z&P6DK6dz)i3N+aPvy|eBtONWo7q(M}kXu$xP0dE~-_S5=uvJyB>8Fn8bqT(%U3k?&
z76aj{H)^}{)(@&6wK<2YlqcnEpgRN{+OE2<-};e~o)KvK6A|RX8&2Kv_}pE%y>P@r
zp7(hltzD<Z`*9L;xS3zllc0)ROVmF+Gk$wvHV2q#oVWT9J_-ioga!>Gkkp_0x%CCi
zLi0jY6Y7k;T-|bsYPF^pVfGuKhT+mNI?+#~r6Cstn*o#(B>w&3&WKsxNiqLEF3b__
z>jWwp%=AoV`OV3o!-b|f;5i3}cOTUMq`^tG`)Ap&!jtw%6t+hJ#o$1_E6w%8$NLHl
z)!-U{L$YaD0uPTnQg75CWV_G*b{Hfe({f2~ncY$y$*dHSsj&s3bh(g+fRqRD`Y_(K
zzW!A)FT*H&<4Z}5ZI(sYWc^(@y|#SFQ#3<Y<otRjpyjh!jd4GdmKdiCi8yF+{DcwZ
zAA#DI_pAbBcXnfSxy=OT1CC(0iF{v|6b%m<7NxX!-#8mek|@JG2P>9UmG^HaeKS#0
zB(F}s)c7G&ooJ(~mQdTj-T6(4c^bqAsWF5XJ;$o)ehX3);pnK|54=ur0>iBE<z?d?
z{pIQDn=J~PlAKZhsIURoVY3qG=VOQ8-<^6G#T1@IK(DB(70#OpUrheID46Yxz_lN&
zbMmHNYsHTwVO4rnIp|JTy|S0a8PRI|{v-TuWE_)`;W2@4XZGo%_l-F1C_apb%&Der
zzh`C6kFqc8U)IvhzNXt+y|P_<RV@#3Aor{aR0=I^*Z?~R$`hzAF2Lc&UZ|g0O=dIJ
zy2ZKeHx6MgN9#%l`MQ`QaCI=#0omc-l-aDez*z;OEqr9>#oH?C#cp0j(i18(C$=99
zo7?Hh8~DFuu6&z%v=O$9>P9N{@y+zwtYa>#E3%`Rnu+wlsx!1;gk(k}%igT6cz-fG
zn0D+`c&28yZ1Ou<aV!PTDZZL>xK4f`eDFqrMF^Bg`Iy#rVv`k``h4j=xh)7-LG!Mz
zyV1T>;!)?_t1AzFO2Y$EFy&L({c0Jh%L%s@2JOC=c?z~VlR5eKLH#eCjc;RGWhZ5I
z{54$r)u=F5HA~jaJC1gZm{eHe#_&F7J*JDSSpH(to!iTtE+4}8Y-6v>X$~TXsX4!*
zK&c7+EQ`6bX$!sIPkN4}uGPV%xF0wctvb{Tgq7YNI3%e8USDS)HJ(YpbE0+L(L?&5
zQHnf(x0XHqoxKWJia9bgLOoR+Eqj;jkputmf*+;6-wYY$Rak{^(Mj`8nANwfs{1so
z^81x{;mJ4{`9p+VYrA}zXl?2@-B!mkI;<?>c^m5E!rWRL_~tZ!m`}?MLkq|kbg&lu
z4b6r|I8bM2H}ZS=zgy&>_MOZBqv$NWqI&u`yg{dQcXxMpgFiw*x*MdMT^b~$yBiTn
zL6BHFB_)&;mJ$#Il-_;!{R8f~b7t<DnLG1+J`YPCAx}<UN$ML6IFS#J)GH3@ytBXD
zTM8N$qt|;x=Z!NQ3=L1Zz;{pzIHYu|zQ+=JV2z+^YNSWfJsX%k7m-~~QcXxdwI#%`
z;19nK0N8-uop63zpDOgF?`XO-6Z<R(;17ihck2kVGO-2Q|HzQ1kxnOrTEK)aa~>tM
zo;!vD1H%gFeki^}6d5b56+%ur67sI-N;5b2&kY~72hA@M4IiMJJEY6u+iVC@l$0}c
z3Vz@y<SILiv<O>r=;=N&tIp6u;eQ6ZVq5t|w4zfd(W{>kG14Wy65$|}vV){m_axcT
zQh*ai#F7L?hXTw{5^V&&tP9-z_yo^~v)E3&p#p*zg7ZXjmpPREuMP7mFj^Nv<+$g8
z+M=LbaWIt!ejx?|s<_v+MDjHxB6oXasH+1Tn18xv^z3%*`bdT_oC>P7OLa-Oi&;0&
zOem9KH$b-J6G>ONKUj@)u}zD=F3@)QVk`G`mEhGSgbL--7TA?iiF7Cvvpbp{D_zFs
zBDF;Y>OA?N@?<if-l2hoiA|iC;gjmGR-#>C5L8qX&&?M1z|-j7SDQAv`U)iQ;E2or
z;NO!jUPkzx7=!gCgc|iYMYiewQk<8(F7MGQ{}l4uVi#)af5MY>dM|po@k-SXOcl8g
zT#&uYdgvZC*5D_Z5i9b&Un4m{N?w}5>$U2qzOl3Dk9{k5I6ZDeY04lVBpSVtE&v}o
z78T_Bt2e;q4S*NW0!5LB=1m<0ce-<;&<Be=B-KwOvj$w%F9$OgV=|gBP>tMCzm)-q
z1VlOi>eI4esf8EGwbk{3S9aJ?3nk$$naI1g1}}Z<SKrAXvb+ic(Wn3f0O38NQee1}
z)}zPxL1lV=5IWIG4=na($z<lp^olVZLea+z-*%{a+?W(4Lq2|Gc%$x`X6^h3)N~Wc
z4j-5_+YHW6F9gzS7lQ$!(dfHOJb@-S?#xRxOJs$13BgPBzjgPi9o|Wk@zqckv&zUk
zy$BiIp^d0R*}=G4*@uiYeq(>79MuKPz@)Py0Fav`r@Erepi1*?YC&{a+jo;W=U#LL
z=&x$MLCz^*vX3wRh@*KJwG}Nv5o9N_I8J7=tPyUPS_YChUbfGc*CXIVY-L|0i#DV*
z8Bb!$ZJOR1^Uy4?l=!&YfKj|=Ouw;*7-?#DN0Yq~1gadHT4F7Hh6WMu!-R1mM!J;d
zb9~vi!E^&20fukW3OSxTZu%Ym1HX(b`af&W+rcV$r<;XfQ$i|=){oQJv9C88PtHrC
z7W!E$y80_s;a(YMW=Eiz(E)H~;^VzXbnb7}g!~wrH8kh%b)ro-1^`0M71GkcVm{*b
zvQ@{r6Kp-NA*9BctQ}@3&?@@)f;jx_5U<$k`YVf}2}Mf&Lb&h6MaiAhZsoKAK=)7m
zzbDO3*k$#2X_&4IK7h|QdA{7?jQTBj|AS5Z#h|hlu5Oer#O0Z0(2$=Xv^vYf$dW=K
zP;qd1hiw%>_D`4A!}B`)!<7OCZ=3(LJu<Z<Wiq9}Ar%jzJhGz}<V|h*2nN%Wt1LMD
z{>gg@tuDX-$c_wjV?!K+cNwr#8%9=-r5Ry=tqoeem`DeTNr+!4)_p(teJ3l3tK(_)
zb~IRgCmsV|4AEX|{=#2z3&)}!Hf)pl+oSx+6X1XLhm$nqKW3lX!|w~(=g`Tl<L^*}
zTUV)BZNm(PA}#~-wyQ8MFLrIi^!He7tw%U3S`1pk>dbpUM~Syfx(+<WI{x1FcIJBN
zK<)*$$0lLI2OGvn$m;tZSCA!;9e$>)mM+g7mOC1vj}K$^GTa(`Berh}-#igYCobXr
z14+-6i9=Wdc;U@2{n-!`!nU54oeXdrwa;cpO7G&co@94T#|IDv43xr|U7C8=a7a9F
zL)On<N*MMk<lhf=L?7xMVI@+wa>-lve}Go)ZYvQ0l^yd#E%$--2C(3e{8td5m=v3Z
zD^~`Gq{rX<Ucu$2_-S7Auy)TU4ycu~t^X68x(u|+pt;qCq#EJVlNP+XFXvztAM=Lq
zjW^(z(A7b-F<8ezQUA<HD%t<IY{MbLw~fs17L3vP$<S~}c1|>6J{zK!Ks75n29`}{
zec|24xf4*^AWg_%_Dh`n7oVSytPe=Uco#5vzUi7~k^cxliw?j9h;)+Y*8w`@Z$UzP
zwU&x0dPHEt;p1}WyBDq3psat`aQ-x@fip#@OU*Bm<Q`ltfA!O&u(7W{{%&H-zI+5<
zN#)YylWNhup1Zy^fPHpO2T=M&!Z^kMN-f!&mh+Gv26m+upS(I2^pUS35o01erGoxA
z%wI4fW31~c<p!)s7I|$-Fmio&_KpMBVNvdLb`MQ8nXCUc$$Yg>rAkst;(fJ@8ID)S
z>f^JNwFD+$%NJssh|}MC;6j--n%ckK#9D0F8qMtHXh1zvIo8JecE3Q0!ZQoDm-;Nz
z<U){A1IlWY1imWD(l114U8KFQ93D1u@aLt_MugkR%1FaB0cp%N*|MRc>bh;SKD<4j
zQzaaV+Qlr%$J8oP?6XC7(k6RC+pxmzpr^)-d7KeyT`xn;tHEDJv+D}~_+<Sw;=kYL
zE|%maEvpz<vm*aEH4?smBOi#%mx4Vse?t76o%(E_xvFcauy3QM-<t+hf<}~FIRthn
zs`um=CYi|3O1+^QYmv^J$q@0wVBD1{Adz#!!9s+qrz>bgJr$<#kWXE_1FukS`38BK
zsJ>)rR6HDhmmw_cg;Bv~8i;}S$mbXx2a1LpvIl_cLT`l07p5fZc;Kbp<ck|9+){nX
zG276{8LpHyK!$zf*Oem`h6^aZS)9+|CsmIDiK>8Qcadm=!=#*~czSHKXwQO)(YY`B
zF$l{n=Htc)$Tf?AEmwB>wsT6+WjVVd`C_%A-f4UcuUuz%6ji*7I>%lVHeQuS4kvj`
zC-G={BELe>hr0|#&Fyrm_iy8YpO<!6z9uI{@?r!ebn9R$ZdWLGV?}nE9NsByw#Q-k
z(>A<Xpvo7~VQUflo5@-MEUavaLie4VeH6YGAQ5p+rOn=9SWsXgvgGtGm|+{cx+22{
zO+=$Y;rIQ1Zx0(xOd7JW?f>`a=KL13MN|(=|L49Eix>Xv|9wz>D%$no;JvOPv?pLR
z^-swi+RL;BH=L+Z=GGV1<5@`^dFx7OBJ2dd;~wum9FrvTo3Ys{W9Z=>Gvxn}n!Mj+
zn9c=MCc!9QHdeXNITyJBO};+;H<DW<84=4qqV-YYl_>_2hEP?QRWPQsjqi&^?{w2E
znk6ZRfyBBYzVh{5s#<q4t8f~fQVl$Mo}3!R%^$tn=X5w>l*ixwc}d$|hmAJrc`fN%
zJJbh#<q#$@U7&Q(BEtOZn_4r@tzq@iJ<ZXVHQG#DVVsQz<G=Hq&ycRV&LwE`w3bAy
z{0vS@u@4qhxxgo+EQLXI7m{ji#lBpGX<s)IW=AaEE(zniv&nDciGKH!Ymk{SK?c8Z
zx&bDeSC#o^FlR8SpkR(ONq?JLD63bI5RQ^MdopTTvkl@rOeB}pxn5QS{h7f)+6m-0
z<_5E4!cjmRXg{MVv{=#e$rxOa(mYv9##Aw!zERVpxoWY<PSJ(g4k0@6nYzxB-njXS
zY4j!2(D^H#NiE+1@qX`6-<P~7-tcBhf-3jYY;T<Vwx#IWOTW;2%MY)4xfR|p7G?EH
zymH#ZbJlfVkvwr$grJuucn(A{y8)Xt-`M>3oYpu<GE3k%0H(Qrfl6=6iK5xaF}(AG
zTsvUnG8bYpOdO{rUz9(#Vt$WNm8~pVvf+?(e!gNpX9%B~d@H*8y`f6?G`_MS8l&Y0
z1z<o0oa90Kb%(f@MOnF#0JCm!yik~0(bp=*hYt{E>~=cQq1f)U{7nz~{;K5XVdZ4K
zldYvgn~90Ee&s9>T|^?I=%pvrG_xm^JGQc*Sn=1olth)%?D<$bL)+8)KeDz+V`hM~
z=E|$jBx2a+jC*<553vT`RD&O8QFq-jXF=>ynv84O7^LmGO{zVu(xf*rX7QTRkNcXs
zT3pssYP2f}uR=QA6d@{yhhv*HmfPk?Dv=a((NBdSOlHbIoX4i&;;x3N@kp}FS2N$)
zqBM+P-ft76Uf{PO*#9*mXTM2&Fmc1vci($y>_`EEU<o=3q@HGe;x^B+U&l~n7DPq;
z8=ib5cI8X&cqyE#6DVRAZLwq;{2vMZDq2NlgWD|tqMNPkYl_$7r+}rCxHW?lU#XQ+
z+3NCN%Fowx!Mw*UlXCs;0Ns+NZ+XVsgIZw5VL(mP2x)?*rrXNq$K3mJBNexV{TNEk
z1goB{Zp&(PfkR$+OhFfBq>ypjCkitJ5L8#xFCyrTh0<*kO^V&tc4{#eXC@Q1tYa9D
zZBp8sPTndS4;ud<6y<DnUAwV9C%C&xMzq$XRl*t;X|+Y|xwb&!=l4a|IFW&1Hp{>Y
zP+Z<|YNQKWmRE<gW&GAC`40*weX#9;JvQEKSBM(52tkj4ER%VGCZbZ`*s~DW`k=@j
z@#{Z*m7T7*fpDQa4J+%Z=8MPl;>I$Q23}Ey$?RlzP9M^{3(pWolW9rXA3-LPk-TtV
zLU#O<@nrMcmYYh(P)=p5Z__+W!ai@B7hEo~l1*qeU0B~bVK9>TJN&LkPkh^IU9Q7v
zNIThzJ@cfd=G}Yubs(?6lfuaEbH669jZjEyk8*U#I~<sLXt`VswiXsX#N9qPm3+h4
z$&9~11~zYIIk?HeRQttC@o=C?yPQc_a2bpDX0a+K-2fdjGdp`97BKEWxUxbf5l01k
zo#+LAEhMO=3;$6tkd%4rlsWj{ufPHZH;~b&2fNX#r#H5)4t*mfX1fHs%aBd`HBTlo
zoXJgVWpV2HZ*<F4=7-LrqEG)6%YCSjmIE`CC67BEP0DvyCd4Xi+*cxPV;na^k|dNo
z{Z_m|(?<IxJ*8zFa_GlAxZ2iPw^`f=zxSo9?8CJ9^!F)`Eg3y^*x;u#_f$z<Z4vHD
zkcQzihl-!p`Xa7arjj{5?=D`n&Bs?4ii%@$53^H7?znnv|DY^PVGqAKsxo^GXuS}g
z7pB}+2;9-bP1PU45CJ3)L~!G)6l!MX$?Haa_{7on_a6sw{D1i-u?vRkQzS#Sni=@d
z4nkp4K1rI<md$S5%sJVb*FO#E?FA3`7<sR2UmGElle)0`q{m~?+W_f8HjP*ZhrA}{
z*c_1uroG0E>R2t6t>@@^<&@A#j>guz$ZgIegc*V8jgl$R57!yPy@W>nV~!D{=~t{<
zbY0PEY0KE!XA?eeS>JwjPB4sAW`9wsz9~A|UWLw~H!X~mdcivDRQvaqaAA1cW^iSb
zBik`aUMav;aj@&b6{u)*<vdR88|;U_p&`!D;6h5!II(;12>SM~MA0*zTjc48>}}CQ
zF$LBJ-@^M##ub2_e{^`ZqeCFCLNvf3pQQP1x{;0<)xitmRK=sbe@UTS5dhsto?>iQ
z=k`nt2RH3cL6L)*pXsFzY81X%TLARg3)~GYaT<7&@X@AlP=+xd2ci<9#<BunLXVJJ
zd2FL}I|MtPmt+8APbE+fu9roU`BwFxu98u3x673eD#*A+3gd9d!mxWjGy&{m{hU4<
zd_x>sJ(-_l{I}a`^Y2e3y~FB^NYi)0Dx?n{uedL5R$6~h`<plk;~zCUHh;;Tp2Hk@
zkl~feJhMq!zcoHNwmN;r)Bh{u=QlbkNdcUW#G@ZY<5o;&7(B#7+w$cV@r~4Xy`T;Y
zrV1Y&Sjc-~yH<HDsm+W*x-R#+<0>ETH~R9_;%f^pmN_=;dPkcvcRF^h(?<b2lg->S
zR2Wut^}}lvfc0b6e|GAgTY0}UsIuEE0<(;h+IX9AnzeKpEjbVE*i@Ir7yixzih+7h
zR)XXq0C#8AQri@aX7m|(FhB>c{WGh9+frG4skBBm_$oW&t}Fyfo#7TVqKJ5XP(LxG
zD(C%4?R}3AK$M;SPLS)lt_;4)i-rB1KOGsbhp*St6Ms0~(7Zxw22)kv_<eNZCu>?<
zn!1WrqhFqv*jo~xdn3uW;e1_#xL2DDYkEUDgL%`<)rWn%S~9LVY<e9f-3JP>g-Oe{
zS98s3zLpZzKJ%q?+_;=5$eivf>?oZvEeO>-`}xm_il#ph*B^ouo{N!|csQSR<L}WW
z+fe*`4E@e1`QHT@$gD>IS@aKQ6iRT}r}yM4^fL!<SCU>(FSwnD!Z=i5fhF=+ohryd
z$ji^U;54DQ6qHX(?fk-vyl+0jdoGuwghM=o9n~T1?u-q>w0Am&Z&dgj7iwpdf2uQb
zr4HQxN#4xMy{m8i_AG%M*Zn}U9xW3NvTT*ZP*JB<ah~)X2SE87Vk4QXg=qJ(Z={I2
z@NbBfF5s(J)18A8(N%J&a@scsk3ZR#EevBvWi_!xN^Kn5s)t_cBggk%R}cCC0{Gt7
z6Ol%APr`4P;PrZ<Cq_{YNDINvbFXOw@25dhh?R#RTa)8zw-u<O%r#=R`TgC=PfOB&
zKj23{7R<sZKb_+b$o=vw+xs+y<a#sC_u=7ZIR8Md-JRG)$Z4Ti|KZ#%mktnVKh3ZV
z=;rd9bc3~UArF&B2Sl9xR<dQj`9BWuw2%I1W%+kd0B5&<#;|a{^G9dR^6r+DeK9YY
zaZAl<!QI__P?A>@)fW~sKBrSVICG!jEZUr8qn$)4wI;g$EPv2F0(!|<BvB&Lb@g^x
z0bW8FjMMp@I+M)PfoOf!bf^44C?uMvqv7gbq|MD)q~YA&=QjdxhJf)pCEKM*!Y=X!
ztWVD*#Rq%VyM3SAT926*4uQBsEmG;utpg*tGjmQj$#mHg0D{q)fUk$Fst(Ka_@7gq
zbtXnhE0l1Uk8CM#<zLN7xVT8REbqVE($TO=1QQFK$Q1UhVht<r$L&Y(L!*Y>X1v?V
zHgqG8@f!N`OVHM%FR$%gRr|=%Odg0czXPFNZYrC3#^~ML#4n46q=4yA5llsjh|s2h
z@tbF5I~vYGuYn8OJpJO1WNE()trV_I9}H>Zg7m3*se8Mw0b`$Um)-N`uhEh7$FMkN
z)<WZc6XZvz(UJvctCV1}W|{cT`lF3shVI6{i=2_4#5?xI6SnNYy$(xIq(4A<)Kstj
zZvt(DE}+HZNhH>?I``9FUz_0W$QEz4(ee&|B;fJw`aGuvfovuCjTVvSLls`KEJx1s
z3@amEblD`X-}G{B09}r3*@EeSYD78+qL0n=tJkdqKk2I%)#Q)hVYsu3-a|H)-w<d6
znM1IdOSQ|hODPH9m%8qMbtyo5Tyc>0F;wM8G(O3%tMMV03O;($;}_;(i<>QSwiJ@+
zS+tV*V_XMyMyb`DsTIq4D*D$R#UO^|1WOUW4p{>YfaDRYST|ytbeHl@o!*R}iGs%d
zxuj<^b447WP^WaJ9pY_W{m=0TBnd|U3?hKa?WR(`Cky3cc2Y$;nAl3#-M^5m!(UCj
zWmDHZX`=-_{toLe>BxMCHlmmCVY_xA66Th*HjN|!_5R={Q;GfNL2%uRf;*&kE=qcG
z5idtlg^*}>I1%Xj(V|Ct;C9h`ewDy6yk`TQWzH5(g(Fg#nJ+))%<EGhpCfYzJ)#cC
z%)K_qrx}{+EdgI(>DfqKc3BJR@2ke}H(L5P!NtM$7{`X&JGPWD1+O&D{X2p}``z0Z
z!aLnwV^2qh13(KzZ)`_qKb~Z6bj2^_)O;$;afNttNxoQkH>JRLPaXE!ac5uk)L|cz
zc#GcCa-7UI?G&jQR4@{Juzge#T>5X!TU8ifMEK_>M9J<a)H+#d8ZNb}SH;IqMVH_&
z26bVE6wNgzU2351dD!mUNc@Cu`OGn8MJ~;nz$6A)%}5C>i>s{+w6c<|;#ysUXRrZ@
zeFa_>FfJd<&BNSMwUm-?Jj572m@x8IuVqPr(C-KFH9DU*#)*Meugm57EpLiC{(D!T
z+1CE!Q(Sd=8eO_SA1NbHKg8)m$l=5ipo?S`PW$s-71_vX%Rj|{dnC-gJCJ_e&#d#V
z0Mw?M0Q?;I$?DqgKpANGV*xxc$ai}6Zb&g-f;6hbEhFDPBU~krRg-h|U@K_{N>V*|
zZt$5uzCT04kE-84eVP9bGR?H?GP*keYZzxWh0$N4uM6J`CGNc}X;@d#wqLzYrO>r9
zD`?_9$6LuX3PnFMFT-E98NRes6v$Fwhm<|pyo};a`eeX4r^vn^5uAh6M+CWMf5>t=
zvDU$FM37_15|#fRmViFsTpqaW-0TR0jznQ?q%S@^poIxw8t2uFB1tNax@}Y~J(~yC
z&{~&?pC4hB<w+gn@82?p>PN~LKFb|O4*vQxx+i~h<0vpW*#30ki!|c$ZIr{twsT3X
zn7Lu-AZ~E4?=NK<`c?d^MFdbh`tB#`$1uWr&?|HVZHht3i4dU+fcNhATL66_`ivVu
zJQ_?@SxtwZcQ1t|j*{4Pc`4GYA2~Ie+JEv_s4ZDAMZEVRw0_;K$FfcY;5F$k?LigD
zqZ0&_e5{6~u--(XFZ}o;wG*%dxWidLh#WhzAJW;#xC;BCx7)N$uyv5z`~}7)0}?%x
zIQ@R;;|-p0{_Vfr`;hDIrr+s&Ao9A!3gu)sLR<i8`Z~6SgDVH)u`7d8etQfdRd#tU
zHpDnAJ2)|RfAe(ZBBk&zO@|GCeF0)jS!sXCF4w7rkgVrP3g<`_z3T5_fB5~2cbonV
zmS;Bd*I6EQ9C0sFMmk-JgZ+nr*g4tb%B2XZ@Rn*_oG9p~iA2Iu*q|U-o)kgId7fn8
z<XZqLnDS$g1P?4i3%(prA!|QrE@e%8+wrje_yu_8h8DotO$rBfx9z;x6OGLNX<?nk
zaHLp=aD%=S>^qWyMjjv~+|?{Z6>jkrIg9&SsN%qg<4x_ysCzN8i<NU@B?zIU^UE;a
zG0G-4i?f~T8i*0%D4@yseP`w*DC`JEiNt&epW|r>PEt6Gd<`7c=6D>rTCUgo&h%`a
zshM^O(W;-ILY0oA3>%)uX}WS{^CgGIb(6vIy!Y+sec&jtJavMk$QE3+u&!oE_~f&{
z-xUQ)ltF&yEU=51*OMs7Y0?y>O>=Wq01u+rh4LJPDh6+7i1vI0g$`rc^)m#B=+r4R
zbmp}vNv_~yCD)dednUQ_K~WV=o`insaYctHL0?!(MM<^e4P+IUkC;mEW&NglRw!W6
zg925AJ{V0H)Kuk=rFY~yne3yfP~|VGkcr%IKk`FDBdy;5_YX05dk=a7J>f*~{}~*%
zYb2rduc&Vl-gfH3M#(RHcB_F6v5(aA{t}LNhLbhni37Koy9zLw-QvZImvg1{rEo>G
z5v#}2kC#XaTNSz_RrU|xLaeKpEj2^Qt}rfXP)d*%yL*QHI2O5;GVP5Grpx+Ch!SM(
znc*lEKvQM^dr)Ow4@FQ&*czQ&OzFnv*=Wa=KRx3-=|33cATV|YnZ#&6InK@B<D%LU
zY^!mC`L~7b`dFDuRz?Oz7@%!~E}+d>Pivnc`v_fvj<~tUJDfh0-vvQCg-j?>O<4ab
z-#<T(az!wVT<z2#)++^dqKYmc%Fp+;tjNX%h4GeFWRA|+-}tMepDwUu+hySwJxA3I
z9o8s5;XlaCoyd45+NUZv(P3nM=DV3|$p5`3fX7;l#JKFkx}HSmIN~24=}_tc!d-AH
z%P%oksE41b1uu^4-yvKSD@2E)g^YVi)^dG*jRJ6^^e}{XiP!&6(*Q1_<!TkG@!h5^
zX`3tlQuPgBqWBk}r`ux3d`awA&7{#}1GIj<$9bKbVgfNYfH7d>roZGS@4mdvWIQ2k
zRM?U+;~SC#H4*}_L&0a0xtxX+(9s9#p7nFS=A+uB@+^qMrgCZIqRZLwaWFtOCW;1E
z_;4e`SX+Yuio`&?1fV_UpFtx`bc4rPKVA1jz)@6O3PfIt%kQW?adQy2uxRoJLwhq5
zoZ!{M(wFg`&lbB)(}UH6QjP%PlU_;IuxE@26nMgf)M3tP|MY%jeg5XQwPvsp58Dv0
zWBoDi0i~7bVnrt3DR#-{AIY2!VxyR!_p2MLN5ZW2#-r%{XZWZ-Bito<Blux3P~W5}
z`P89Q$26zhCVEHXmrpLY?e;sV-)kz{r)tTeMDs_hY)GfZRD!>^?=r7^u9mYZH7iGn
z=unBNoC+HfQwB!55bm)BBU_`JZe5hx>d(Zm>Kz1F4+FO1MjT>z;@h~FEWGn<%^!Y<
z6rf~*U~B^~qMPl2lUT2XC~oh!Cq5WPqD8UX8d{Z4nd9Jw^Ie?(ARhoi{B9-Gml)pA
zuoRsQ+?8R94(bD@0cF#?*#$q=Tt+fd7Qa565iASXnJ)fbr?}a|K+!cQ2}cXd31&)e
z+l^k`-T;Ihm2O>kgPEHD{u@qgI@lH*LinVAYE*{`3h$d1IM<$fqCr6i45)AWnjNu`
zBd9X}v4&rPNKu{;Y|B5IuY1AOBJ`w)$nVcj4OOPA$;v>|9cpg-Bjs+5w4gP7_Q|E`
zY6x0rIBHTN@mZaz7#yjZ!sxG8h>k6W4=UrOxJd}8A?4p9tn-gAW**F{tY<cSP}ljv
zFpFLP6T_>4nyvT3JsZ?Dn~#u++ja@VC{Nt8=SRbkE#|PEOD|ExH%EFWZi0p;|7SLg
z;}5Ek@vj|Gn7AeUh5No&NCf+X%2cS^e1(6%I2S}>S^<8itlBHByL4fQpo_}=!$EQ|
z4_`OS-`-DOP@q<5{^yl#6_#`fL2aPOgOPNXJpt7ss$0Bk9&Cv$)_Y&DpRgkU;!u@A
z1036XUWd+Qr|?{!9#0Y|N_#c3gezrPoV4aY#;#Ex2|?+a%GH;AgtGr?TebEtJ53I}
z1R&A=c~>z+)!$0_0Jdr_E`w_R#u>Yl)b)RC?BySMrf5FOCk~@|F6vK|EK5fiz$`#_
z`q%&_OgqNUf%=P&peO7;^lh+B;v(S^%F*%djs8vG@gJE>nM+^R=fa(sn)I2niunt)
zjK!yU^zM(}-|UIcS01HZE!SR}oj(<w$Ipc!wH_3<Mj;*}on6Hqd8ysW1A{<F?l5M!
zJeHi*gPx0MB#m(yp~Rm-0nAEXcUgzX?G?RkBZ$UA`88*Z^lz`FiyI{PlKw6L7s+?-
z@d$xkedCy(Kh(Ux(D^4<c~#c2A3zggaDy~#m*++nUB5YE;h!3Qr@I{m2ueH|%zOK;
zN4^VU10~EUUwrVmt1kS)-+b$WbHN~iV%uDZ@X=@ge#ejpqud`T@B_k%<TE>8_$z+D
zRrxeP<MwP4HT%~Se3bNOXDO@)+Y_iUVjw{uh0a{3Q(utU>)i5aY8z|X<DTWaD*)MF
zs8z9kpllyUJyTta$%`K8+&Lg%8V6lT5r7?89Q%N6dI~i@V5S0rrHn^$=kID1!2eFd
zEvL4;shV_YP_OTRf8l<ffN3m2V7Tdiy%!s(jc4OQapsMw*~bDG`A!|eAn+aD8D8KS
z;juyu<`Vk9xTvPcNP4Pp%4hyD2Jjk;A5%TiFY}h73N;2DW4``Jeg1s=_)Cd#nf-RB
zwKuVN#da#!Qjr;HnQi5Z5KTSl%BOEyYg~~9X{h#zbhumZlRo~_$6{e9jKJ33S;7pO
z?2GKN5VO7O9lAP6yD10L$U6H?>ZllTm!Dd#_mv{`Au_Pc&F@DKaf4db8sdHHQxM&I
z2tXGA+*&j0&9tb67cW78o*<$(*t~*7>x{T1_?v(^?<yqmdXD;(TOHi$XkPr1qQ5(!
z07>;mkx}~gqT~zXU1X0H#OkfWPZ@z1viy@h_+60HIwa1z?eK9yzQfVYPXad#6pd9b
z$KA5g$F-Lj>u@yUx&3i49b$FW5)D8Sx&_F8yB1(kU=kd|jbcxzx?{;JjJ2cx%#ijc
zs5I0z>o{;cn`h#OtjLwo&vQkQl6C9c)qblkEi{R9CD=x3T_NE|f1mW)6%-9X5wc$l
zJSKa?x@FqEXhVk3yNan2-%kR2MN!@-hxAOy;!X=aXGC^|S*2Q1wdRO%EZLBS$~!hW
zM9o~E_R;(0t!zSX7}`y`!)G2cZUmN~PR3B9u5z30VI=#3OJ2Tr-O|w!T!QW%t~mX|
zA0{iPcJl@mLI=yd(cZ7~p<=<M2stWBZuR)oH=u?X5B;)*hbfA=)DS6w<nfa`&cV7k
zdD(KCc4P48`A8E<MzeCUx<lSVQG&_ts;gfUj64f^&_jF;Z_-9Cq>B%3-5`uc-(jz^
z7(NJY1Wy~p=Bs@D^R}>rU=me}GVmb7g8>^;O7vSjQb!p0J*S40yW~LD-iOj4U7(>o
z{(=<w<DaB=)FJo<b%<0M@a5Z+k{SDL4MBG=>z9W4*tZ#WGWoaW4A;XSg~DP@XUA^(
z>NhXRfI2;Vb)&@N?YCi3j4K7)28;m(x_VF)Bx2IT$otPAD&V&Y#H{xZlyI<@1E~_k
zn(C|BS1LEiX}FT-ial*Zq$8k}>vp3<@BgJRDirZKAOLzQa50Z$mto!vBN=E?4hNZ&
zgq%9`Fw`?2+3;ToC<OK}k1vJhtK~;A;(~mH3bYtp#AI!$hTtx)X~_Acl8zujIjI1F
z`?er)7r+PNH-QEyERm18m8WdAKeT^x9!-tYp8scz)xX;+8w(-kU**|5U+x*E<D!g)
z#8Zg+Wll^HE3T3JA)|*fcM9X_<2+K_Wjh0uqYSYb(3936-yhHX4-@AD9K!SJVw?F<
zK}Kk?;d4u%hpUPwG3Qi}0sdd!g0j=zug_hpBX5mW{=OXJo9RcsKVOsb|DtlUqY9;U
z?hL9!oAkC}K$~nFdHq%IfEs#HF(@n=#yxnnr>Kd=+FcrIOt5G#P`~m)w+!;mK(J!z
z0mkIwaG|46la4?2HNA?VOtk1}R5+gd`dA=ySEs|v8$Khjw$r!%EHa6yun^-HRQ-gv
zY=BUU!d<(adiQ6iYzJwL&=`iCs`Vry45PSJ(s(^TN|0~$(M6Xa2vyM{9&~nNiLQL)
z3Z^P5B76`N_K{f|@S|6V;-`PO#|ZlSwuFM>KOB^Lkg<t!xcYrhG*T8p5`x4=k99c7
z|24(g^xvbfd#rM(QnSM5tSvHJg^Dy=`}x-@{#Cip)^u`-ZNI>Cbj~As=|Ex>$%1iu
z+1Ba(zbloLi(5oSdf^jf7*eg!6aI{H0AcrTo<9HV=J|`De{S1kUFFj+*}93aL7+rH
ze@+HV(J~plx_c9!@S}6-7j=*rNEYDn=Hua)JBFkyySS(ASY-4i^5WQM)Y)ktdD{<R
zJm+@{MGj&@lusCXF2Xh-`AqlT_YzNyS0yvSGlTb(WYZ8~zvm>)O0GY4@mw*uug-jQ
zE_+%&0{H5ZN<d~wp+fDl;#qNdj&5PZd;i|7P3WJAI)Oh@I_=~m*e9IO5E0+MoDjlC
zi_Ak#%|7!foyxDCZ&v*6ISd(V+AzgPEwn)JO<Hy7d}l<#T7<P5Ur^^z<hB2N*Xo&y
zO&#8zWnK9TJQVvicdZogAJG1#BXZUu=#S&|w>*Y^+&FM4`LB!YUO?)_nfHCN8p1NO
zkL70PD`b*va~#R9RuO#ku=HQFF1&uf!%g&=Y+z8V#6_6tb2!7j&wn`7Syla6ZqV*O
zEz@6VC{Xqi+~0?g+{c|P(k(Won((7X%+d0MO)x4b*CP5`q^GZyVT8+U`}S94!RXpa
z?Cl-mc4-GXy=I@s=~Q@R|5=eb2Tudn3`F0mY2g&}UCiS*xto4Y#|*oi5e!V;$UAY)
z?I=U5PRw|@qlD*^qW0e0-YkFrj37Y$_4kki@bCGyVOI|CO{#t?d4(RQNc;l3O!ty{
zZyC?kc01W>3UCq>8<htQD9C$E=Kt;*2!6J-!90XI*%DD!|Fdb=jL^jE%S05YX`6C*
z2u*~tt8M>2zcFZ^lsLq_#NLqm=u@@RI0tHlG1niM5k}L|`^<a4Caho%{6|H8@<VtJ
z{=|O&-+3VmS+?U(^fiXmkDZ!<wy)lP5Th58-HnU+a^UjzN@B|6;JjhjdI(9gtpFOY
z)ifm`lFY4aN3kc^1&4=hmJze}uc{9w*f`$9c>e}lvfEg<eZSn*zxP%9M`c?i(Rf=s
z7xu=XmIu?U<`c7rmA=+*fASfxD;tOGVbDDTHgZe$Bh5V~`J{8Sl+>rwhlEr_Q_Pmi
zsfBSFmoMPQ=2Ik<yG&R=ncGp#c(C5{R{hgvQhD3tyNwv`#Gc6gB#w(`2Ql_Ql+1!a
z)+t{*eyJOC7KAU}c=Yg+g$V^pilIv7rtxRL2X>(wM>h{KMgJ#0dp|b=qegirstAJ2
zoOTX1bLK9u>2B!(DD#Eqa_aq)gYdr2qO(+|Jcu010V~?63+D3*@~+Vh%EqS$uR<wc
zEVk*t>m7db3(jS7N-Pax$%4uNO=m)9tsVh14PYBw1y|QdEL{)}jQcVJm7fHJ1vdGB
zcAz)x2m`<=3aVfdWKyehn7%aK-2i|S2C9G`P;LaVuDj1z3!kK<!_U;=o0>>GR0}a*
zMN8=u23yI?DB|BqyNb}iF6kH>Zngo^14M2r=0d)Wwq5SAKQN%AIV>nyNfp)ndK5Y}
zjQveS%F`_9Gy)+iw?J^>1GK@o5^?YUgCq)fY&!cPa}=G}EOU>fHN!k)MFtEl)6109
zi@b(_ZK*>*vfx+i%%rVz_qi?C_?jp@p9_Vout5!$6J?Lw;?;e%Z!n%Cu|5EJPy!lT
zv<Q$Vh-}w?(KR3zA&*gpDn(=k-W6Ra3~}l87vCwM%d+0-iX_cy4a)GfNxeBpTwOz@
z-1`F+^2d7jl4@2<uS5t@P!N<&>hH$kf91Zr*L-&*fPGYGyU-=n+cOt@XjvSL7h!Ur
z>=lS^hK52^zyjZEsD+U~sTodrCDVS325g3-46*b;k*rwx15WphGNN*J`~_UwQvHDV
z$itMg5`*Z{pO!D<kDpmCY0d1D<s8t2$3l(zGkbDgpzsLaXAsd0X-Krtf6`YK3Dpqv
z5a1I@2+ph^Br_x4D6aF|?x6|Bd`TLjMOWs$aY5BO@o^$G?Hx-Hic`WY1I<IlY{^}<
zQ`U&hGty-_QK(S}mMlt?v3gLVp^@GsS$?<ihQ9%a7bt+zWN@hZObm*rWyaPt7yBu7
zz#LFI2s|@%=k>!AQd{*yc7}AkI1bbYOSUGkoTvpgvdzpW1KT7xgeJ<*M;}=#uuaJS
ziw><&x|gGwLYnUqT^60z7b*uFi5{;1-M}PJA3##Jg8D$#OMuT<y?QkA1`omBdq|#L
z2AGlUtG;p_(4?rEYbQeL;vD~yrFFO)|0H3y<Ak&4|3&y=__9uD3ThnN_kuxwb`Awd
zGTY?YU_c+^T4D+CdUFQOes2RP11X@KBV^K0TMU>!{a+3G9MStwY}wi_Gv-+}#cMoQ
zagDus;#{PVm8ndS!vwivVgwpWbMk+Ph>C6j!yvr?I4ODIJzB4I9}51kgz3*6Fn;?^
z=Z<P)`TQILC=-lBBjndNDw7d@we=_pRolfz!9c(dKoVkQYhvl%zTd8KfZ1$d%P}zB
z2*@5yWt&B`=#-49jr7=+5)?4q2`JFlJn(L<xq0$MscBF`UfJ5?ao*us5#~7fb-w}L
zEC2<`h(h<z{tGA;|N49^-1su0<YIsT)n};CekV}t41=Dwd>_kN2SvG1F9QJ}zh{fC
zj^HTbVP5C7DWJTBKz>iDoPVKOLP9?@h=ZYTp0ri}#?0lCfk-X@kWFVNpmPBhVswW+
zk{&K7p#6&cWvs|CNXUxiOYR?&mf}2*gZE~_f*y!Ma&F{b3wwIM60Y*v&NzyE4M)<1
zRgf$7kQ&F%cI53WULlD?rk+Y`7dFeyK?{Lys|Jb>sDr1&!+1l*UPnW&Z>PQgCy`{?
zaP#1EbQ)a;%Nr~Q6H*x{-Uc;ev0@@ulGc~cr27vyg$QHy5H><c2{x4uX_#bbbW`Tl
zMPEJz+JagUkI{fs#Z+PU5z7H2b9>&%T_<H@)xl8ILCAivGYVt|wgxMJJZ5(ed6f<P
zWJr2^I(d;iD3cHe^^2y;&v?Ulr9V?L$8vB4HHZ+=M6RXAl32}JTCR)=ZT-*0Tk{jJ
zs@X?B{u#>Jqmpi(5Wwzcxe01WvCQfov8P`e)g0xx99<EE@nK~mnC971p8U+yC>&K8
zgBn$4j~4=z#dwueMul%?)OTWpF*e3=`1mDJ;Bn;8sL}0Tp#dKSS}8CT*yo5-GcGeq
zOEEE->0d_4b=;DPFwNPf3KOJqdV%12BY>4mDc^@-y%ZUQ7-zHdEF$`g-on96txyWI
zLN5avJU?Z2<ci1f-L`z@J(jo+1@Q{r;fI_tWu)_CDwdI)Y%<(N<U@+GD&>In*mVJn
zn?wH*0_vkRV7R|`MEzm>SdDEg%uaI0x`C5f94)Sq&d0ooIj9t|5?|LaB?uOZ1MGni
zP9eGR3)0?r>9Dae93pY+UUV9uV)g{pi090+c^6KBC#WpC+mc6{%q~#=oFqH0>@!M@
zf+8(Rsx&w|dU?+B`wpMp1x+OuF#-lLUN!LXx!a}5kzqGq^!!kyh?2Va!!=ZWVq-)l
z5~Q#X{}uyj3n@pv)`V(XaPG4Et1jZ873g<|(gc_IC%%t}Cx32a>Dj8?9o#^=pm@aS
z_gTaso>YvmxQhO)Ctks)NTp}DB|wdz>rLG5pFm0l)3NYk7n0PJ9Iq#b{A&;Xk|#nu
zn5e1DFN>w4lI7cgC4Za<@IM`ZAO<hcpdgYu-_C}pUQ(TSuKX%*NEtwm5@{LP(wQA%
z^coEBS(h5o28CSuUKUngz{?c0Zoi%n{Ehv~rC1;2RP?AT6nYu+gW^s?zfi6<Y!s!C
zV?zN7)FH!9g`c8h1YPex`+U$Ez8lm(y?&W6_v*7`_6bXZ(%Q()AaFHQy=~k{%HoPz
z*X`o^>;nhP0D9DEogUBi)hw-E9NhD!8ulXpDMbIgV^;=Vu66RZZ#Q!zCW=Gh9W>Bc
z!DCi_B8PVJvD^<&e+c3NC@1beOM0n}hex?^<|`ca=ZOl%?n&rG+$*v}3;8<_hBH6_
zcBoTwWWAGXhX_C%ZC1}$vVvxB5gE5qO~ve7`X!sF29|o9NsSUhlumVE!h_0(8Hfz5
z)ZgUaqLW@J=wrgAn)5$Zu$8bJDnIc9pEU1nDiP8|1ukGGRBd%7VbHX@qaDCFDvvD!
z@9vx_v-PX{S^U+`ha-}4eG~2vDpOGAfT=d7iK3)N(DeuFkH8e(K@XAO+*ySR#BWaI
z8d?|!JcPiE9vvShe?fBzcZuKL=O)BjbM*652HzD-hf5PftQVshD98`vGUQvw6H20=
zH0H2m(V`rGtyQc&>^j`H$5m^jpP`pLUX~24@doo|d9N0g(3~tJbM|U;u!SFGDe(P@
z%ts|q=Ug_Xr#1Yaz5ps@Bl6+|n5r3HVupt~l{}uPpi;C3Swqo~znEiz#ZSBchfdfo
zeRf^g2bq!WGLJ4W#y!~aJJh<z08@zOxH0xaaw_wK14w$RS}RcJ+ct*%t@NuU#7jAK
zStRt~^Ll{tj)!YN!Cpzy%Si(CRDnt4H`K=FQsk0E2a_zyS96gNYlQ4jKo-C=WnfZJ
z?~WJdKvCNUUY?B_v~6H8+Dr3_&WW2_C(12)dZM|*Y#oP}PZ==gi>rx3KeBiLY?)fo
zP^Yc)?a?jln9pa=d@<5|?|;HrFF|*hml$TwlN7i&CQi_xX**4gbOcT-0ZO1vp<^H>
z-=%T$@g=T^y_wSBOxa-0v(Hr72A7Ip${LA+s08?m^bSS!n46JQ7a9_Qmjd-EXN;q?
zADW0N%8K2W!@1xn!Y1?kE%tQkiKQBW)gdo7y*rx=2nG0{^;LmsQfAQ*ZEwpg7o{s8
zr8f%fQ9s%<?DLJ7`M>`c;?>fPVkHsyeIW9?a55<#m*EYPC*F250tm|ElDyKyUG5`$
zWIj~I>JM^cf?^zX%;KDae+~?pkrcQ@dQC^w<LLSi%!tm&T2U!l{A4v+v-><ptoF4`
z5S>@Z&nIbrE4%V5k;RjX`F;@B`A|fG$;a+*{Yw`)Q&c}(ZnD54<^D^dh7>84o531J
zb?Y_a&FG{QN^W=i12TEu+ev!qIuQ_4Cz@GPbEjP+iqWH!X&`YI6x}wr`ZET@nsl%F
zPQO}<z#%?p)o2@3_nlqT;|bG3z;yk2J;&kwcptaG-%v%3$k|RLuvPs%GtPS_Eaj>l
zO1fsU-134)A1A+=vT@?E1jSM>We-@%^t@;*-Uq=@+dtMC-avv#BK~6rFu4{|biORE
zO3_)%{%T&Uh(p2Sru0-k3@$5U=#aQw1joX?x51z{<i3IWvWvqSGfs_P+olupEe*1E
z*Iv4M0aZ{Fnk?rcLt9{YVe;?Eo%<w6%e6%Q=p3G^ZSs@P<brD?5yuRFddMVaPe!>0
z49CNY3^^`j6it5M@@W)F)fK5nBzInjGh*TwW*%mUao>$T9%b^|{4m%ZV^(FhL!&XH
z9CNA1G}mQ3O9LykdKFB?B9?*FVnFgn0KZ^~VA%P4cC?GP0H0@IED4a6N_wMgHn7K9
zHgutL=Yie<S<)+2ISoq;_=XWKjmew*Psv+1y^&Bz?<>yZ=xb-i&2IjM2o?6aAg58y
zQF~~mnjOG7nVveT0n=`EpD7z2XrW+$DpdXx5@_$=4m<xT+x;E@P1Lobxn1!f;8Ofu
zF^og=jCr`x2}xb@@en0-o#brA7Ew%aOVJ%bySk(%Sr?mQT+IIqxC7!|<O$6lsBt@1
zb|~@8YT_N+9GI#LpKO^?`ah$!PKxU3-TD;bxH3~};j<rbk=#BxNk7Dc+m*X(4tH`2
zv|haESVS^;y-?J(%v{$pFSus)u_@@U0vyk_ujpydF0a?BYct@_Ai=;(OLl^dk{oD0
z_lZ&_5Wt7675yPj@?&IPjq`W7+3^KlIFjQYA$0p;*tL#b9#(awP-i@sj8N=^eUmzH
z=4&F{hr3w49#a;2?W|RbPkZG2wc%;D?W9}(c|LMZZ2;V=YJ`4ywnTh=Sh?9jivx{K
zuJGM+YEm6!+evj!IyKE5yMEGxI-AhFNqN@x7ytE*bS4Y$*tEOmceR2e-r&Xua={U1
zmOTLFkta;3$5Y_NfD)d)6<WjZr6fJ_?|gTLX&rY`E@ED<KBx^rr7(87fOQy+yS}5@
z_ByNe>$Zs+Ce}YT1!BEL@UoXGgYfM%vxPkP0roptLWSF;7-g~fa&yU#jF`8W8(4^b
zWf6D~4HNzk=8d4UpF<gJjf++Vb?;d*D*XKdgewVD+tHbJ>|^xg@X;$mx#$XRoP^_S
zuJ3hn{&A{G&;5D3zZ9B_SNPw|k^ZD}uwwnRD)-(p_a53`)U{Fv9g=`4vE{I!cYab_
ztd3|tSvoY>l_Xws0{-Zpqp&e-d0D}{&sm>_$V;X0!a90jh8!joAZDqaP2R$5Sbd0p
zzf!BJED#SVD$HV3^uGEAw_gBY$|i5*yYV{=BR^6>(X|F#55ZOFwqgP4gAr&&pQOZ3
zYWoy<DKLj`ChprhbaNAl+Of+vi!wT?o-9c88a+r?W03e))ju(<dyx4r0h`APMo)!~
zD{@59X|)B8%3?#}rHrcb`VaL5@EJG`3+@h>d*gEe+CNyaVLh8azVOm|URH(>_m>ot
zcRE)_x%eVaf()Z(g5!z|zN+;vgLe$OGec~-s&{>X86x4)RJ{s$-W|Jg2jb41ibk}~
z(z{b=xIQKdKGa&$<J&`ML7;|&&)$=}akCWpB|(;vd%d$uvDFae>2DZe*ex-!Z^Y_A
zUyCtc9elr*?#U&IWLcz15?ud~#HlXLPo&1nMiZHtWJ`J9p}L`tr2I7TKB$rMjR>G}
zb9P-2EUN_k@X)5gzdB<XL$0x;56Go-%QpSA`b;tBC^VYPP2(xjz(+){^T2MXmy(p2
zEvyqD_{AYBe$6<DKWk%x7U}+Uz;$#fiI`8iaN}>{w&mY*y*gb~VDnNyg%xC9N|7S;
zxRE?1{r06s&cWK-8mM2kk{b=8ETNMy&F7kqv7yK=Pf1j!oD7)mvpM$Qw^%$jmQ$?%
zFgXEN828_EhJE=l7^%7UQ!@{I(ffU+Zd%^I|Hz`+SX49U{2()RD`X+_#3xj^1ZVm_
z3^ciysCIGVxsFalP(OmIDW@n4kNyeR;mB4i&^}VvUO}AZ8E%^y;AEsQ4`EJ+ON1De
zB?wf;RU*Al_L?|$mQ&o{R?ojk*7u#hV?id@B>)Bn-F40)weA@4hm53sO-{J+qIF4D
z#CDvTFE0z7W156@B|lDDt+g$9GBhii>sX9rnGiAj{`{R~`C;$xd-;`PrFp@fEaP>%
z|A<a=r_yOCbo)Fym3+|~xmt%mMVCo!*QuCLvCF>UF3!4dO)VEvPMM5ef06H{-=sYF
zyDv=cMdC|f_iLqzvX($_b7?MKNIXTfu{K5}6RX*LCi|5=qIIfo6&h9CXZX|G{Hl4c
zzKzaf!m}(mZTQ<I(S$1Oim-ys?LFA;$G0rFin`Kt;(K4BogMcp^E8R(qAT+kbW@0<
z;rO~ggKulN98w-sKI#1JPus=@FF<X?*;XWSLGi{58-=V;bwzutI<hJ?_Oh3npRlE}
zzW!p`SNs%Xt<#tIa&+7fm`tiPq`D48vL$M6aM;^a&l37zMa3z;%5D1SVfguXGUG3D
zGwCk`qb96;g|g(6dvTYvQ^x%z1jhw`2KDdTKH`)7EU|8kF88;!;8rj(w7$v}iy;hU
z9Pe^q9QGXzV$tb$U(XOKbVg^xRo%LvKuh~0^A)Rzhis{Gj<wsOpvZ81@yA8;Lq?Jd
z8ds7@G-uw)CYjAzDpeyOgf$W<RHJQ|KrmZb7x8+sLa+u?LaC6B5?XXg7_|tf>Mg>h
z<V{{^zLUy_%3i$vmk9UJFZ2Z+FSg9t%m3nduOc?>Osz9D1)AFwV0Kp$sg_-_LNouv
zHt}-%-%5;PM`GpGAUBsYZgilLvbR?}5j1%U8s$$7$`s6AoebvkBf96Tld!!Y^pWp&
zS^3c2#<-d@w6L>zJ1bD96sP=?tLg0}<ts(B)#Ldw%r^x_X^S71Z}s3`R<fZ<&bkWi
zA8)#AZkb48bQU#rDc#=AXHqJV%JQbvnoH`ip|L|Ub<@5^e!|S`6kv8d*({QJBj(G=
zGDT;`5yoGYd3|fK`i<16{H#@zS>k086>i~BlG_OG!(TGqpkw^n5pa~+)*ZuJVUG=x
z2*I`PBqyXY<Fa<gk7vuzULcC=`&c84Hu5PkpBl2x(RcB(S-R29vsi6wJp#n!730j8
zZmilCxrz(0PJwwmCnu~pCOP}ltPUJob!W;ljSpGhO-ThjiE&SQ-?>E+$5=Nu1=_;6
z?TtWh?clgwp(D}rF+d+)O$l(u<Z6Kz_>;@7Qa*1P+pa@&<>rn^V4L#fAQ&>=d~7#A
z$+^Co{{mwgJBm3bmJ0BRudr{>WpA$%!_Z%P*FlA`!E*Gz&^GBFr?x`+<~jA{fsI~y
zZwqkfvvCP;K|Rqxt(<PQ&L+0@G&$jO+pLFMtccQav<vmLP=4HiHW!d~vJIzcloaDL
zGa&En)}mt=cepcfW%pEBNIE*=zrAo-{k(_Ty^p`j8@qvK!}_aG9*5r^V}MR(IgK{s
z`5g3~OJQS7V&>4*fTnGu-#!+)1=O<bopKo(=**XFzWiY|%x%u(Eb1U2hISD9WD?!_
z6b$L63)7YydmYYgAEWUTK&ILx{2n+7j}-37kdHp|fuB-G0R9tKuTlh$wNCuNZRnYL
z#;U3E&VAjlNm=?9*!sa!oOGXjT<dX6L|o)Y3ajB)KDK1JDMJ^%|CI2dAnT0l4_9LK
z>SkEK<dc1FXNw50^tfw#3izAG4+q#CG<o`{sUp0ozK<u-A2HrMiIN1cI4BwL$g$eL
zlhrid6S2Lc;i-}<yvy7k5lPnf6!=z-ty=Z=sJP?(-`|#HmhYVjI5csqn@U+soRvPk
ze}UUi-&gQTb@sq<0G`7e@#?j?aKJ-XrR(5H)&yjWqiJ_MYjN^Et?)&8QYyKx9w+U8
zl=0>9g14;m2VOrd@{DhYL8|{F>8!(=dLKT#F+xe{?hZ-m7y{BI-5t^hNW%uwAl)D(
z9fFjUFpvfrA=2Ff5`xs&-hF@XwZHbqIXma<IoH{q=W~B<DkIw`_dMGDl&+$)d)F0|
zui1G`y+r3D`W1@{TVU{6&zhc07Q+bZFK5<KS2*42v2keGXt=3`%}c-G8X*oa@m=v@
z;d_ajl?sKI)Gg?m)GzSYm<v-Pa3xL;TEkZ#BJx$wn_2US5=#XpBTzZ2t-_#Yi+m;Z
zaCasFVG<9L`SsNWHeYt;GrLBY4-wqERc3)L3&%vjL=4r>*qix#gxB{PW4C&SWrw#S
z;?ks!9o)oOLTY(5B7#xEos0|151g3|gsHo;)lO3Gp03=Kj-7*%n%CT2NMag~qGqf=
zTb6r}Kfa6YT6cx<x{i@|LFxtjKSZBF*e*SHxL~21fC_{7<i+2k;sACJ(E~A|JNRy1
zT1miC@BNnD##rBx&dG$ppq;56ANeb7bv@wQ)w;otlaRj_Vs5%rn&lQG>erZx*n{kh
zBvMzhOh3Y|Qesk<;&dDe@K$UF<}y-2Wof~fldyqsaO{)!p1tp1CmtDUDa=gB>_7{|
zrb`jZvzjtJX`MXpI4<#s*Q4T~rAbzjb9Hu&TNxS0y5rB%ltAwr;`Lp9S-RbA1K=YW
znVb;<5pMd>&vYGMf}=g4l0S`#zu>#s0c!sYYGIo_e>p6xsbA)7oL$jd@sloBw$J&B
z7K&AN(<#y0DJV-GuqN3*6!EDDGc>^?8xeWNa!vmj<3i8;QqkMo34C-B1TvZbPICS*
zt<r_s9|_^(>i<v{xi|b{K>9+Hq_;<sH~()-QdIC*ZPWW=e~CRe6ZJOkiN|k>$Q#3O
z5=P}8>VyaS51}S1M`uJwHoM&?&-}@F><6tMtcj}&4nitBsZ!}M%SlkNAKJt4eLqKp
zE4bwfQIZhmWlp2Aqxep~->hIrCGX!Xp$SWR**wl1`CTG|##Ko3esk^WM8pq1ZpeP0
zE0ZnpZBpjF-lk4L&Ysfcq|!<9yvlqmI@a|s<{Uozk_KsJt?P;mvgUW|%+7lE$NKQN
znC5&SqA6}!?r9d!zmIP}l{I27HaD|Zji`GS&2iu64TcdebY-tUY}I@78(3aO78Jeg
zr;e32Wcz@}Y<8?DyPTQ~tmaPjFdOraAtCd3<gl8}fn434Pa1!-Fp%0*t>IOHC}aJ~
zR34RIML3EK#9PRp%KYnpC`^m~(znoV|Fn;b7)zT#p0320eT^>eD%x)8p=Yxe<a)xs
zVXYDWePHu;wqnvBANqML(xUl!dt%;Mbw}-Zmia6zc=f)kX)W}(iTfKKxWMi~VM{5G
zaFm#no%uI4^Nn`t)zgPAk`e6Qp@}`6*iZ6kkA`t!(tO5bE%l~eJNFh=P!YmnwxqmS
zCm$l*Yc}}S<2{c713`+Yt!^!#s%QIPd<HUxw+GtFYXg~@_vQZ)Sx*)P3=-z$%+=HQ
zL0gip9^sOSt}v9CSE2}5>ihydq}%sAoME52lVqtw9PAk5Nj(z<w2+43QnSPP8mz^7
z<@dfG*e7P^@bK>KSTBl*e@c77l?Y?`oJ;-d5RA-^@q7&DsC<!xkZm^?g1N%HsLPc~
zxcMlYeT`~vV>I9sh8ksS&`WCpZ744x#}V$F0%I_9@3N$~J1+iaLW|_oVwvNeP#Cpu
zTw|d0j17fa4v6O?;lgfS>T{vvnf*t$19^e1FlOf$^1pA*BPTB1!>G9=N>KT8<RPtw
zsq!-D$r{<q<4LC=iF<f8$S@*2B<S?>kx!v#DZmM~xu9>i6ehy$(d(sh6o6~<4U?u*
z;rFJb^91GbgM!Xjy!f_;?1lEp%GCSrT^kZ!g75Ie8k~ao@6(3&9|=uG3ZEt!67XKK
z7)#;Tqfr%pi9p}o5xxsi0TXF~nT;%N8Jdw=@Z1hlsYn<SYfp_HV4e|2^po`TI2Tc<
zAO4dzITVfXG?^Mv@<(=JY9o$?NnMx|_Z~sol;Cz-hfe&%(`NnECjN9~k`}8RI?Q)2
zSF}u3dhYl`hYJ2zW}&04@P6$oSmBi`%_UW;30<IC6ZbAE6G6*3K0tG2MpC9z5Y%u9
zVFu8Q2aD`?rXc@3W@L{0itZ}LYzI)2$1onKqo|K&kZM(<`O)gPNN06)2NG#Q(>!ji
z9z#>%CxQ&RQJq}h4kYmhy)oAsIkq0l1Kkm!?&}Q+2SXLWP`FaIH`N0JB&bCGk4kKh
z&~rwgRs59To{}+wpBlcKk>4Ap_Ac#L!ZeJrLl=dbs^)H&VNW-#GxB~p>t$`L=j;^f
zxPzpG3+q`;^6n0SX>ub2cq)C=f{A6!ra`i=(l31yWj`thY26cmT8pQryLI$^C)IR8
z_aR8=l)<DHKSNU~BnQLq$d!sxcuRslQX}y*>-L!AQ~5?H>E_@oKfSgWqDke0<fFQ0
z3P3uI<QPPU9>pZfA}e9$q(M0U5O%?C%(e=951kXOgb~Pp9N5CnY4Y4fq5AEwgHgH?
z!q`MhU4v6T>zddXrHIoEq|A={E-B5qh5JVgT?K1YW&j=`K1{M{;#iFIqD!M9C-d+f
z{s|~nEU7`I_`J}V!Yw0V9POGuj=M}ClQ{4j7a6wAX26S8pOK9m`Z5mxatPsB@q8U(
zJ$RRZT|d1cLa4$=qR67)`n{_cxgW#mO!Be-_a?8Q8SL=~6+Z^gI&3@zmvhfH%Rj9u
z{d2UI9Y*aL$c}5A_D@d)@HEy`>b~V!q37)f;k@`&%wG)yU%`8Ou&UuIu_hdMfHhV&
zOTtU0WR2MF$E+KBLbBUuix^^(_6GW2Yt9JTgJidpwf=Tv`bRt`T-0#v=KFq#T;Gy7
ze!e-1pQ{xpo9UL*M&`kHm|@y2rS)ax?zfXsYh`VE<}4r#t9R)|p9l{u`e!aWwP)5W
z=W#P72&mEACFzC*94_~mco*cfPyo6nm`A=-ndw1AHfhgBqQ~Gsp`~);w{XpIQGr-~
zvXUPU+B!9gPB-=TpII2Ls8wv+35>L%^qaKa+XrYu1)n?8xQC(1TmG}^tkB*FV3lNA
zNDD61^y==27F{i<$HGJMQoZ`OG}$kOaP8>w<UqcBd2N?kB4{J*KT3<fjI^|G7jP9X
zr;S{j_~-s8ZUb6$r0M4QRuLiJn9*lxLn~nbkizm0biDQ6A5hpLJkSlt`gigOS7Q4?
zv@qVD)k>1=yFTMG=+W7DYf-N<mQ<m1CM$O`jt>)J-iwNV?vWt!M13#(7^9Cr$ntk|
z@zt3|Q0sS?if982uK2pfEuksvnl%zGDtS7lQ+(O7(+!MJjgZ`wwbJQ2-mMY@tiZ&v
zn}2sNYge|?clS&Km0`pJ$FkIVQ2Ax2Wdo=O)+U_<wpav#4ZQo`>i1N+_w*UR6e4bd
zmQM3jQ#kZRXzu^RJa|0Cu)AQy==yc~0l;H+VXZ6f&o*nHmF9j~%~+}YpO1?;?bXjg
zKp)|-Lup{Vd_Yd`9DmbGZOSWNbn<kXHgnf4;r^HEeV^X*pG5q&hvcRvVt|!Q+`sSZ
zcRkq~-OT#vEBaKbu2M}?<u54>trPfNptASyK({m7w+9^yfHz~Yq!=bZ0NQlo&7tkU
zzwRA9mL9f_ALe?w!EStd7iLO#_XRH31at?)vfua3)PHq!*J0wBW`&*s3SggSATM#;
zbM9%?X_k9p3G)taA+CrsKnRS|W&&%zLi=5$Y3wiG*oT~IL@1V6OlT8K+-53b6<aI|
zMM=g@`I9yrGn^}1Prr|FqHhP)E@2ZvX9_)&^~5NEl~Sh`8t;2f^VJ||aLbe4^~P5M
zV4?)mX{}|sGpD3|7Hj*^g#_zrgmlC0_(c0>2k+PjubeM`jo0g~M&Vd%%d*2?C>qUA
zR&plBPQ{5Mg1RX<(--A+RDaj#6y9U@pnVKDW}n9bi@dWHU)Iz2<HJP-nuCZCMqLy?
znVk}ib&}tP@t8(2Ec9Wa!GnAPABPtX10UMGXk(;~P?+dQJ@BL!pg1KW?tuNxOu_k}
z7w}&)x*)jp)%b(au)}a9cT@BZy$vHebdG^=q;z%IMX?dZnUDSGS=YGNMz^{}e1#Rw
zGLAwcyE?o$K-40P))7Q7a~pjDB{fHd8LA1vFKns4Whn7Y5Sm)4nq`~_%&(=>Sm3WF
z^adcX&^s0C$hX>^RJPOd=7<ojGC%Rda0m8*&?HM1irU#vGw}(`>*DM+y0p6w17e1m
z=QA*uSvdA}iu(L+0mgg|yd^r<_VV+g`#fQLn@?xJUy2qk5TTU|hIhrjY)e<uj)3et
z7t*kGAbRsWptfYbP5MqK%b0b(`0x5N=S|~rh*>)7X=xt^(V0+Mu=ZmJp8HX>Fn1k<
zRQRW|o37-I7ws(oK^WRv5dQ%wgq(?=es(2|fzIit{Io7ebx;XCWosfghNJN+fx+T`
zf{#0eZu6<wJ}W3v8`3}=1Tgf)-P?-+Do*?*rE30<@dm<u)*Yyj@X8vARS(f}RUZL*
zr~7tvn!Y<7xnDR_ihy$=oRat>yx0+R@>&Dy^D#+mpqYdG_5(EY6NuxU^~d3zmH7u@
zhYD5FHH~|I2%7m}R5zxPB5xp(KW=`vaFQlTc@>1?&I`TTh5C9JEcyaZK-TCFID=Kq
zep{E7On0^?_Bs0M-wweG)VNyIzt<lw6wTu`_#NVRR|SVbpc~M};0B+8p^7{V;2g)u
zWMSt4gn!Lt`O}2GHSudt();z;pS@pU#q3DwdtS1L#hXq23wk6Q)QkI(TSf7Z`^bV0
zuHg#zf;GB+%R-6mvQhNDIeByV*zQ;HMxDO7Q=52#oZfZ2QwL>i2+O3N6T)RwdMBL}
z)(7YVpn$!Zf38IDTt;iINbr&)2sY{4&YF}UqtsYpGNZ3;+3)pjiV2fKPzp%P(@s&g
z(0kk}m>62QfOqbaiz)@{_cN^TJxG(^<Dt=+IzLWNb@Ar0$GI*Imcz4Oea|nzX$Rtt
z{Hb~T2alnp*JzVTHr2Uq@VeBM6%z12AxVX^I<e=NMV02&*YAr5^yvNe4x)Nj17}K(
zgMS!(WSDEEZ7uqAlrnVo1yO4F`0)(Id7h29IfPe7a`ef1u8T6SO4?{c+CLu9aRXHF
zk@Z-k!BK^$cj&PY%D<-CAj1`tq&j<DSaSeJGf<%aoE<$!@|M~n=CQ?3VT|ZkPC9V>
zFm_l^Sm@}R%`OrcsB`>D>+Aasu}Z~X)_=)+XM!bfI;9avChU<bfD|H|*Xe9y_7O}g
z1~t2Fg)!Ui+mPNq>0u3aX1@<9-$>3XSo<<LVQqoBrnnb9wU34vcioMiANY<z&CUYY
zk~Ibj*M$C7CPU6tF2QLc|CD=zi1>$58GrPtp<9$wEWd4X*3>Ee`Zi{P9aKgoCc>P*
z<h);<u9RN~S>r+s!|n24ipWpB1+&~s9(OnVn!9Bj)5Jc0*;a&}^!o~Po`5V*9F%WF
zHL9Q!^FOv%4{l)Vs&^@fI#uC&q_L)z1ck&+3z`|J-G)+AT3*^0Tpl0xam^Ip*&Cf4
zcx2y-LvLV=nW&jd{d{G6St(CEus-BJqQ*y4;UlLBU&z_*%^d|o?dZEfVEV%H`^LeC
zumIA>us%-*@~h7df8;=n)qS7tyq=C&MBQlO$KhGa9ldkK0_OGTq*$FX&uKt_jL;O=
zSKBKw>Yu>{hO!nhE3Z+&MZUGs&oZrjoFozwH^vpH(KCK72i0z@&x=BPm!Ba(2nGZw
zO(PSrZ5O?ur0(?j+qh{X+Rm~`WTQw5OJE+3OqvcjszMe%{C&A&yoYu@u-SE|ya@vX
zAi$OHSBjMj(TIb#D~r&ID@kPWni53W?1jIS!{yoYiUXT+fEy}9l$KHO+hg=>2<u}W
z;xXSJl}Y_eCz65nA2{o0)n4J}z=vW*I5@9wm2nIb60Xp<m2~}$z^wSDVpl63mG5_o
z>j%M-n|WB|>4dP~X+8E_a0{fpBZT3^5V6uVX(}Z1&M+?cH}~)MFOURh-FjW`6E=)+
zn{F~~wy=At-@CO1Xdo>~gZ)(z(wiBUl@bW#zZ(Xv5#sX?1AiHAean%*gWbvw$B<|D
zw{L|*XMZNl1D9s@w=zX4i3)*T1dt@+;Z9bCLub$Rsppr)cN>8}V!MB!NN)o(kY)oT
zK5`yie{fDL<`9q2@Rqhi;6~i~L60Q|3zYtge~YYM@ajQaZ&hNu6BOI4M8@efz`WS%
z-U7oM&~et%v++rT8i^4~Emk}XlU-iNv&uC93FmtmOD}QXX_rh(--Aec;L>%4;=xmw
z$HS*r5G;iff)io-Uu;ifSKoiMy;-?{<lvqtWOG-3Uv6cfg&B<n%o<CSKWr6&f5iW#
z4{j=6auLl(;@KpMR<Ld|pRKuxK+faahLPvngy$a+zM?g1Ob(aRtI&otgz>$|zbG9c
z(iQtGzl^;Dzg=Ssi~_dXCnlIQS8V)Lm|@udv1^<FOPko8@#G+|9^Z!p=}@G{m2_Aa
z24Rq7y??U%>h5C)T<Bb_>z*RwZEsFqT0_C@#vc1#4vf%?Yufi~H>|2)`uhw7M(N-G
zyQjN3`lES4IZFAi52GEL`m~t6gqeHN!~Y)5$83y22J;U=HfdiM8TEZEJ`K}fW=y}~
z$9SIoxsa%H=n6}@cYwMHh*>|cC6w<^^MxQ3TaF-hLm4H`WT&;fr86$#^ULKO>!T-1
zF?@mjjBldC*QMYC#wT!0G5uy-K=O4tY8H3MENo7XV0v>e7Z1kvOb{vQc-DG#cq{f|
zH0`f{$*WG5e(hzGGZ(MHU^#F)UYS#_&WReGt{>`y#RkBs!-N2q)|%(QdK6&&dh74O
zZbE{b{*#BHi`dtR^p88u;f2O*)~S%>=p`-d*9VTge|y+UIKtW=zMgM1Gy+@De=(_`
z_Ws2mDVGC4D03KH(<m(s2=|<!z)MQJfoL~)8Y{pWzhQ(Lp1-PvLQhZU+wiU1bA5q}
zeOy6<2M;v|E@4_xL`$+c8|W6<uhCjFfb!XH$l*=(huZ`iuTx9(0iWz{%dJ>oU<Sn_
zp^mvi>*6o)l1QN+ZZihLdLmHzJ&lg(2VM6`td<2osvmFe&P)U+LR6%ac6!Ozy2bxO
zBE{RME+5hF2@~WlXn%=AUDi7!@Lss2{0s6w{H=YL2iS*Vbzd9ZeueLUAx4iM#_4;p
zH-LJ>KnM)w0}#4R;1JZ=qWvT!#CY$Cs6QY0_G;UC7~>}pZg%BW3eLnkb4P_tZZi;0
zq^Kyt{G;Cfvx}OHwh;N`y`tjSG32Vpk+^a)J?@1n%WefN9<A{j`TjAn{<O1JuDFzK
zOCK=D*pzADEGLu}x*mpJv2PHmNuBQfduNtVs}IzP*a^ApQ%8yq0W2<Ce}6mlYGxf}
zY2uaq94?MROK<eda-<h}IUm2A_YLQk{V?vJe6&A_tJdJ|(7Nl~ncLhK&DA*@<z3Z+
zsLp|hgJUjG38y4?&yPXLb&NY%?3QW8-FMbo`(NL{_%a|uvjOyDD4~RsS2oTvu3;m8
zLF<je%8y_CPUcUK4UrABY~)+rIS3{91L^~Ul<~fYSulT2`7YAsLHgH%<2OT+UPUA;
z$%YMPfELnn!x}kKPWc)B{0-4aZ~qhA#Rzle`+whJe83JJE@4a_-r>Q#9YI1pRf7g4
znIC8FJm5LSI6e$Q0x8Q~IFIKAw=I1tee}%ZJ){XYJH_ojHy$VB0T+EN*!AY_TaSlC
zruulh3QG^4zb3Qs4mm;Ihabl^eUjbK>f?spG<-cU&ehXK#URW@MYp!%&PuHJAa5RT
z&UJ`{!#0AC543vrk!J^x?U$q9?yOl+-99pp(pL*tD6erHHUf6x$^!}n{Q&$N!U9Hz
z)YZ<S!Iyt^X~vo_9~5g`)Rve8AP>`E%MuLg8)DTZVMV~MxcMzW(xvIs$JA_56tVc<
z#a5AQeZn!-NU!^<r>eSgjjjgknh-8u{(93B?0c28XK{YEaS)_eUDIfe3JcTG2ZVsf
zzh@{#7YZ$vXEJ>C0}!`s?1h6HQ~X)R#3xs0ykNj3hNZ%hpTpBLhD*>9)!Yr-Yt9oj
zL15Fc(GD(Q6rUjKvxtW829Tk=DmFL_a-Xcf>I>~0*RdK!H6bwNt}wCCda9GVn>`3u
zL2)7i5{p!x{o-GTI6z*$zJiD&4}(W99=rEcj>O3)aqs#J0WsnNB<J^p$WIdNl$$s%
zMs6un+VAZK8mq^j7nWYpTTs{q{=J_D|4EzJwxaG*4tq69HiX$&+H2Q|yf3;M@oDiH
zlq%Lp6Et|@X~B3&_*+O>b-@ChfLuj&Pj8gPdRD!d!m&FPp!!F^>vQ<rrsa3of=aq|
zwQBcdRzTdX5%OzbY%t8a^_g^E*q4Wtcg-c*oM-9G{{B0Eqfw;0kjb3BR5k5@xec4_
z67I`|-Y-%jN)R^FvMxFiaG!^4U)ccJ5*5)C>3cXDvx+G+Ny=sEwn?~5KERoLp(8lt
za0#(`p;9w>yG^ElM_b?dZ^<)2;PC@H_!Iqm<C}$z1UfT<-0kj6TW#_1Xl|4Q`qH8h
zMOP}JsIKx3sdT$<IS5)j0?5=^D&yxyfPQ8%E?&|>{U7vw#$<|=vP1CwP=0vd*f}4c
zi|G^*WD_H~EqC0510(hpnvM4byJCpI=tw6GjZCH_nSQrkbycMB3d1kV{&6#<+S1_!
z%rai~9Pw#{8U8z_Z`KY)Z_|P?mPF=a*SB<1@+Ng|uTuq;Nfu8IOOdcU?3x5`#OFDr
z1p=w;=Y}K;L{p&uMz8;<Jimk_yz)E_kmf)~^6tFI+o2GC)2i+Q)Epn$W;OcJhhdig
zfO>5d#gVxB7e+<^lN_00pd=;qWGYp-dj}Z?2=<5QO;UB|A)yBz&WDN;<lbTC-S15a
z#U~(%uN1ywk-MUVuFrP;(P?_NJMa|UJ00x7kAOAvNG{!Q3hqduy^Y`Xr^2XL;ltI4
zzb|0a1>ydt<$WKJ9MUpQK~+7a2gEqd*IoxGvTKM{!mj`F;QWyn{=lm3MG$}TUUk3R
z9!l)~*xe`BHV@bRPOc2$@0Ek1<~e&Y@j?vGF+ebJ-{{U1V6@1lx5riwB2=_X+_by|
zgAbdNE(EA`_s1zlSXWx<2uNW(P1-bONdP)*uP<{)u58y@HctaF`TkD&8?J&EY75B2
z7rBm#LBApl#ndMOLI-o9of*g{xtK<1szLZ1ie19lFh#6iv?n{dWY<C{gUvt!)iXN!
zT5<FSO(K1^#$f<tT*2<$R}oJDi?mCXPs|z((K-H9YvQq_Y5{r7lVm=_J$aA`#;=Ij
z)_%jh>Dv#M&IjUcro%A4^C##7A(jP+g?PSZ)K=H}r;Z|P8onv6B2bsy?XTH=at|s+
z3GI0$SO)kn+kgPx{-h{-8{EJr^@O5bo&226IyYWhC`>d|Vw*F^wuYWf;}s+f-O_9$
zzT5y%L{x>MT-xJ{^n*@{Kg}<U!4*BX%3H^ZE~|%PwBxBO?fwHBfWIOfX`pU|8Ta^@
zu&-O!m3<Si|AZzS_f98L2#lW*S^~*`!J;<6Kx`Su0ydK(FpmGBUg2taAM+7J>G7k(
z!e8zchdS6o1Oe@KnRyR(eZuA9%ZbJlYKcBeNHGu&#iFs%#SB0%|E#ir^<h%ym?jkj
z@si~ZiMQ_rn+^MeiW6ZXn^;GU-ejxdHgweh=~r|T>d2}HavK5@YWJ5(;$$T*Vrvc+
zzQ6yupqUuAP4=b%u^(~De%3S_y_tcjAfcuwJ(yO&I)letfFIT*DO5~3d*XlGOAj`~
z%=-ci9(f!SySdo+yE6@cv}RFBQZoFne=-OH0f!*-?=tx4i8tc&$YV|iBluH6m7rUh
z=J|y~2uS7=#*18xw_(c!6c#sLWe|cu0vViNR8dM-o7szj*#PP9S1OrB#;~qHZ|(&+
zDGZ$AP9E?JLXQ?gV9VGb*-A*ozQ;+-t68F>%os&;5mZ$(GCN<0I3X+Sw3!*m9Q2ty
zo#AzWw~D~Dfnc;2FLqrZq6$XhVKceo=>&KXzkuMI;8<a#xO^LqaACschf4mF-K&tq
zvGJXzJIB8z4#eP7$J|_9aPb_;DcFVJnp7bgzWfm19m~S1wKvN5EkQt@^kA$5bf7q`
zfm0DC05iV8aN2(ic4)8)3w>!o8oU0=oPIF?WYF%*uYg=&5e672&>?qFog>n|CY|Bq
zM~YlT$~sgvyn==7zF}OX`)A98R$kx+KCMp#Wq7O4y|})adw$u1w$FqEvg6hzY1j7e
zkYiexbLhb?j@H*^y*X0a%nkhW%rPVv`2>yTphY2GqhQi?NzZV!uw3Qnf4ZMTMR8*L
z585XZI&V2&@H%4iW56RI0D;{S!$FVMDab-wr~Y<7-#I4Q&<iwS?qnu~(%TSLHx~(R
zSN~<3dEm79=(70~V>2oV_*;<uFsv36#>frL5N<E~;Y-Q=H!Zf}T12Uj5=JA*AWXXM
zw9=x%?TJ4J?q5Suo>S#I<!FBtbI5*POrM`gd<Dh{$1VaTPAVw5i?sQw&@((<%Kbm=
zf@4FFo5oVUf^s>cImMTbAmEvX!}M!4l$TDvoPwQS)+PZ((O4hGEp?#ssmJUai&+M2
zvWUmodvwVBoD|*tKfZ?RsNN|8n<}Ofsx85i0Tj&u1XQc+l#zzJ!PPTY-8T>gPtk~{
zCAb!2bgvd}zlLNW5cu^X$dp5%IsBV>kEkX}0?h&e!Z8@;fHfIdIrxVPm4iAlN7fit
z7^r8&Z>HZDvjc7xy-cjYI5JpBHKfkWyJoV@%%6%~-!Db9A0P8BH&QWR(LLJ5YFRV)
z57>c|e2bWFR33zB_%(eGtVby7Kv2DIxW1f641Y~VG%dQ@A05;O^68r2pUmcE`+BX}
zBI?soXkKgR!B0y1`;WvL3`aDR^!N_@=V}9?PSUSks--6gcMKxv!C+rj1c=H$veY)~
z&v#Lk^x%lx75RhyMm6Gc1y++Pbt~aK>%~$j^a){xV-}^|%kzxN76tb>iH=BnnJ`1Y
zG2M%rsN*ksQO+TX#!uOXoN-OC{$sOL*!uI^Xd*GIdQQuq47#E@E)4VyH-PChnZ=*6
zUM_AIJ5qYpy}1#@G?A3a5^wf<@Q3d=pJ}<N&^~|_8hl}#cex(Po$Aak`pZH*gJ}dk
zfbn+q@h^V)^P(>PU65sg3ItOVbS(F1<cuq!jwd{@)5QiRwLs<nano;a7`N6qd{)GZ
zIz~Sk=+Lqvoc+=E+z;~4B}Ib<eZfN`?2W(Bk|f_v;x*@n9mAmQv&<1knS0KmM|0cD
zieV)uMYHe^$oAPjOc~B#zFDK)lL}Y8>=ogzp$tY&)IM^GM80s=Tu#nbcBhOiDi?Ig
zdY*Y<l_hQL2uENwu3YXmqA3aNz?aB_o~Tv@XWYrEVvG=WBpjH+4Z$<$NCik~za7m@
zbI-Y2U@!jo(4~32Rzmix-0d-G!7}LbZ1<p|ClvNp?Oms2#YA`8I2x88D+tQhsPZQE
z0=6qobGpY}ocj`>y*dDfS+DBU(})*G&OT!%FHwp-aQzqo=GepNSwbi0Wb~GxdY}&a
zR9U-LB)9v;4DJr2j#63QVOSvdh2F<fAN_5(c=}5$Y#?pN<NG;v?<1*Agpc`FB=+>D
zhqbihHe60Sd=uuQ#DQ<Qiscq>Dd?uYjd;+7)|PL)>YWOhTACpyR<?-ViaN8#9LRtv
zI&EZEAgG1TM~9$v7XyU~5f+zQ+!>T2Fa9#E6pACE%OQg;kZ8aZn)s@+@M3MjVA+yr
zVrrRN-FjGk9Ola~PBD>lZ9)xoY^LrO;!J(hbO!|jWPnA24@F}QI85mcqq_q;>4?cW
zXd@{=0Vp33>>YvI18+9ULypL76>HRmYx8vZw2kM@g0W>bkzj}GT9(29u|Nisy_yTs
ziawUhKElaqV>=Pcw>W_*SX*K1QzEM2^`v3h+<lvv;2ZL((w;Iz_K|hz&2~)di=!LN
z)e@JYCCRPp(5i71mK0OU7%EQAWN@*QsJ<oZ(>ajtv2If4hdRofgY?SBChP9d(&~rE
zZL2EE{vSW4@JXM7J(=*0O_dc+aVqw1O~;wo0ZTc{RQwaVT7)L)<FCUIbI@Ck_koH<
z!RrEJ=+zJVUWnyqZotJHaEo2Vn%<RVeyp#KL2eUi&gnMA9H|e;eZ3-%z|Rd0XCK?4
zVdkn}!Hzn|Gn&IgnqND8sAwE!3V23&gX+T0%EW4D40ayGY*N?=N;YXYq}DCkhl*|z
z$9nqF-O~g=C;p><u~~SwN0saIXE+&XqYAprZEs}G8G1>m6~(7|y7)m{9i^|_t400K
zgK9t6oKcVwkwqzkfB1Fzn66^v-w#2Dw6R2Z&*so46t~+5LDAuagLlZsKL7C<_$$%*
zA(WA{@ad1ck+*NrKZ7hl4?QK@H1+hd6=(Bh!%mZoSaeZna}DU2&yj%|RcR_6I=Xn2
zPn^_d7kj0qz3ZRcqlA*uN2RSAQV2iEEhD}E5a93kAv|_nL-XPKv9wdN3fjTtg??BW
zW~UsmuTW2XGk#Yz02!3l8NWnjVz=%orkrlDxg~4_y6EF*_R82FeRoCGtQuK5n%{`V
zYLj2pAp}}}>1OFC*gO-0tEm^jjOit$rw~K-M{7+fch52qZHk@r&br<2zERGRyxa9L
zRfb^;aqgSD<MK>D5T6&e9plD2WBb@~mmwMSQ39FBSDVDZj+c?73?JGR-H0UO-B<Vt
z>PW&Cj%3X)^}?z-q|5$A5d4Ni!w;Y`<oC?C#G}aP&4UO-0;sP%gHVv^PQl9Svea9B
zTLS;@!`c#e5SA}G^Wdn|jF03_HodbrToZlsVS=y|ZyTTcBrIpBV|HL22ctd=mwGY%
z;r^sRLHHBlB*sL&8bMD6mcLc*KmVmR8lhxwd(G^n9YIIQ7#cyGdevQ0)ya(4F+7Dz
zJIk8&9MUC_yy%oJzb9|L)*qd-d>gFHgTI>f%FCfaDcQZZ$k<S9mkhcBm!DZ=l#U6x
z2vHXTjovuBDlcXtTA39;JwMTSg_Gl01#l8r$djD=7pFVL!tO32td4UfT@$E!)%f~a
zq}M-M^><+|&iBim>fq5SO3*V|3`1Z(jI0S$CX<cV4A$(&fUeL7_HmUk%1tnkryyPQ
z(9afqI~1?`BiRU(R?eZd?a_!`Y%LvIt8p$HiczpevWX!jo}u<zYLC|-M_;UkOiAA#
zxN)sUyV#!4S?1#@9#Wg&Fd+NKsruSi+GolJZ+IAh6FlHVd>PhG2kEqsm<RmGkBBZV
z_7``>IKQgr4KHYSp_fcg^0ACM^cf!8A^xFkEP#LU81G!iS&dw9Zbjhjls~d?n$zBi
zjllJhRfZ7yN4JxYO&&N4#(7T)9g2@z*XkTE`^K*gA%8-q1Y7~6gT)Gvf^yfiXtDPV
z=qoWUWa5aZCxK)yN#>A^ObsYJ=tIbI=P_~Ce2EstUG55&r)`aNV<@Ox2qrh^D!%em
zq1-=&Stfqk&0chnd<F~pxBe@%Q7Dy5Y?rHv{pnwvD;@oa;oZ-IAbAiJ*8?U~*8{?8
zpLs+6R)+WTazE;LLRlHbx1w!^NLUO2KhJkn{S>Nr1c7Zfvi$n)$3zacOUEI{Mlc@u
zeO7~r0d}W!3n*-t<vb8xvj73Y3xV?wd05T9n)-og`L;ybTtU39RTV+&4~*(ZqD6KZ
z?*oO`B~ZN=8~_GdfaZjs2uQaL35+#PLf1Z)>a+~1dvJY{Vp%T9ToxO0@y_u4_PyM&
ztnuzS$K5`*5({{T1H9u-SNv>Y0;4$Tl9KrQ{vwO+D4dNJ?zZaTE5(BP^PQ(*O<08F
z{gG#4VZ+jndsdPolpjJ8*T!>vCLC59H(fVSFdaLie3M<eN~V@;)78ttW5Z&?9^Oz5
z^lZKe)BD4tAKj=y{iXj0KONQ0Cmi9D=HHStLv8bh44EceTL&-pR5Aj-G_CzsBV|Eh
zg-^|08SZt#l3s?gkx0I5D%5-Y2;5&faJiLc(*D+uixpVZ7>xV-N(M_}*L+B6Kq>BN
z2adK4h{@|>(#R6um_OWx%_j4reE5ZQLc(`2g-rgME*7@9x0%yRhWDk>r}1{GY^6%_
zJild*<}f~P2bmSiD-h~JeVIqxp&=U@A^<$TP-P671S_E7?2NPvhCvR2d)y%HeT7eA
z!^5MCGzTej!S<Kp%GO7RGHh8Mo9H)vO08VCeS^k&SbF%;Hs^eI0d`jc(;-dEw!h8o
z@M7svg)kiAOSNS2Hl=QlB4rU`k^Z^WlL*9%j^d2^Z$>d)R~Wx-GN`27u4p;l+)}Ax
zNoDz|GCha_m4s&1T>C_)s#xv}Q9SYaR3#CJT&*`_hP*=0Yn+JJLbL!aQTZ9^GE^(?
zC|}kAu!V@Ajt7EDs+`>5cWP1x>3%VB-XHF_qbhJ1iE7K7^AKZqdi-fOP(m2-X!<Kp
zHA}?jBrXpE`UnYk{p1a;qLB72gR~wohZ3!+76!b@7w-(iD=SX5Kk~<MzEB38#L32L
zauqqZyb>l0HyW&{JuhF!d=@i0T=89)#8o7sLDMIGj^ywk0jl!VEtcfYKzw`LDAu+<
zw!G++XZ35(hDdt~P5N;uqSX(eEZkuGNGI)SUb06Mwi$!6u{xJyShNV2{%$yvy^1@L
z76WJ&j7GlIJ~uvGL=J7|ZNBBo>3$_NOOev>s%$T&H-7O`q4rOSulK@C99oyFb(KHr
zO~IsI{OJ=<aS)80!LJnBUzJkd<zLpBJJ_--2P75P8nzTlg2H^5w0~mWJdoI>=CA(E
zWj{;xaT6e!C3-#qFVE!vxt))wA#nL3z*fsd&5BaAb3B<yvh7IhJPkYkoY%yRb7qhF
z!cO}vk;wKyQ{({f1NUi!#MZeX37PFdoe__!Rz%%^cO36VR!myll{DZRz8XA}4aJo7
zqV3DKYCS95yJ8X?%@DWA;9aE(TytfMPf8V$eo4Pte*ax{*FBULe7;_14k9QcV2H>i
ze9xa~gRfZCu?I}8LmEBSD@I$HzN*BVKU7HpW<6PxGQWo9M7di{p+sMOCI#a@y4{M0
z4J=r2W|Cv4ks<~E_22v$kN&YKLM~ZHmY}*GP|}xS_jvk6mPX_qtNr#y$C1~^_s)EV
z=FeSAE^Ou|-^V!?ot4aVDRedb{eHO~kwk+Zitq0+Gmjgrg-X|&)>~jh5)8I9j<W5&
zeAmprCT3JX6qWJJi^GoJR`=-hMCh-sTePnk8SGO#t|pL$JBwZCnO_@GE~wXqCFynJ
zv3y!jED};NV96c(5tkWQ3?9vyUu35mYqlta8nF1(sYzxV@*IjecJN^;%4exo!^wLs
ze}scm>$2^a5K?J^r1#IeyvBx>U9@g(g+lo8eu5@;ACKiNUe0dXX?)FT6(_wTcyc=h
zz^8dP;waGqY>U=0Bf+Iog`;IuK?5(Mde0qQOv0tK@=h9FGosz?@r)a~fDdov&3%?Q
zmDWy=g<tEYpbU}S`{IP<e#4rqERlW_|9C6KXB}=mYW2)q=!AmMGzQNFD?8vASpatU
zSZzpH=fmKoxA*fyOi8q)ZYNg-clA=CE7n>}q4-haQIo_fz75CtobbI{_;M5a|0;_D
zT?q#GbmJZItw~b&zJ?Oht?ApVuWUDVxeGUnLZk_`_|i1ht{V6zhWPS-x}bb@E_tn$
z7Ab3q9U=7Nwf$|cT5AM{0~Z0?an#+Z76}ze&k{L!X&VM~bt~R}#zsj#=8jUv)AX@4
z=+e?=4nhH+l~)7bI3DHCc(rfbX2#R>6FJKQK1kT~@(U>pA7svU-07OWD*+HRPb1cp
zy58u;$Frz$6Ot{;l+e8TX&~T8kB0Tk>_lQR^T^)*0IDXs>P2i-iim<ruuZFja}wni
zjZL2I-y_OKjpbRHGaV(Rn=wA~0Y-b$gp}nu8uEKYOnF{0k(W4YNlD&W%np)ZSNv?D
zd6qX}nARHGrIC$j>ao-M<8(6~`kl7azz^%as}H|$!|j7Do?bk?=83%&=zV3ns>SfM
zj)by;FWao;>pP<86FDyi!reSW?vnW@f}$8k_-sP-1NlVD5f*Q#5(wIZr{t05O5anH
zeN`jA$cn!*3agRN?Bw8bSn-T}BQ~g1E&pwR_S`o+f~#@BRGzWJ^K#>Ser|PCZflf|
z3J)WHU*6ydDq3SmzHOb{F?}t+&bi+hQ^^=c1FbEAC;^TV1=&{NR8Lr_Yd!eHjD?@6
z{%kH`hD6MV#kY_tk|0-E#+Ce8qJDB2M(71KBiqrPMNowqd@b6FOXW(vX8`(C>SR57
zw{Jfq2}eIBq~KgS+WFn@>epm4B-kj+KO27-VW}$(&1u_JPG3*{0Wis8BgR4Zp{GGP
zGPoL{I3et;{Up^C?X;*Q70|G$9TQ0$Q4~CnFFEQCq)!@;LzdyoT+_7so}(FGmWKmH
zap|at$w`8iDF0;58=^p$!rUEt!CTKh4B2FEDc6DX<#j43M+LdW?zQT8S;`}9QkMm_
zv@#zpEo)HoxPdh?+>u`FKd)+9&40QLE47uR(k13795-En%&uc_3KQ6iX(sr8)65M(
zWBU^ir~$Otb=QQiq_x^Y%F>8aGoOj&NQ?_VDTPyNw8WHFfB2)yjrd70`vMN_q4|;D
zX0zu6P*{pdHy|(8loJ0LFzQIv);Y9Hce3?qtZSex@~yVLou^~Jen6MS^KW|=dVL%#
z2gNU2H1>IIm?%ctU$%7ZS5qw4UE5_alghUpu}9M*{8K;_e0acGH>RG^sAt#Y<43O7
ztXeoMZ{wISgIl9zWN?;hb#-M)-xCjv)G84+X4z5hWS&}E1z(;AaHn-51k?eU4-Zx0
zsRJFj_0OpVD>-Z%=bybG)*Byj&DzlaJW6CI{2EteIi@zxMZ0m3<fXL+v#Ul^d<_oS
z`QMb^{OmIGX7sp(xLJhU%%wdJn=K{iX%Rx|i?mFB%~nO?UU#a4tS?_~Z6eP6OiZY8
zU4~LL)$~fiIRP&i`HjhN2Mc(~pKI-_-%$?5^?!2pS;t|C6I6*Wo8RF|DtA!K;!)&Z
z0c$*|eiAsgtXITIrkl|5vO+@2xb3TzVIXtB3A2%caoLFS<4=blR36*2Rrn5yXxP!U
z5|61lJ(2MZN;Kn$PTLv#t#%5y_Gz~W<XgOFaf2&J-+esoPH8{=Rm~I2h#rM(wREKk
z@ILne22m&1B<z)r<l-(#hZFtl=za^+Azvc%AC>)I@k?*rB#*u(*DD?Aux*u^xlnLY
zh-?GB_fhT45y3XNqV0dPKk@t%NNc1W{;k^zH_5%On3MSn8GF)RnWXFQ$pr2sBE*dc
zmbSRR=x~ch+}vIJ1J-1KbIw|9c@!1gRgZposL{tN7gXV0nrudK((|sc*I}gB$${Wu
zME$OE#Wn$VM}FcQrG~@(HbY+CjpG^71r=!xebs6We=#^PkR>*Tx6A?ChW)Le1y!88
z*p&(^Qyj~P!cQC2i>=x~y1UfcH1^#`S(E2`UUs&eQPJ2hYsOO^Ur$1F86H?(rA5S_
z-nAKteD6$6z00?rZrrjzasG_$QQ{;wY3$)!J_C?{&YPhDv-4kdg%CrY??rw6x!${*
z<FKtun8PkDmN+C+T@<Rpum0&vVNgvN@rvEDE(_5N!E`yso0Vlkf0A)O{)pH|e?Cc~
zC-;7FrbB^6yx`@Du%$>HvszsDtAp*^sGoy+>JprQjx;B=@nh^@{34G!Ql1b0?5KkY
z4t21!@L68qxSf$aadS7s?IiC|#DaE|-M{<in%7pqN%v=-8TvZ}U7R5}y)$t`{-YI9
zZDq|9Gqie{oGkJi4`7VY>*b)^!pUKH%vp`3Invn^s4bVgn-{zHK>57RN~+CQcNICe
z4P?5EfAQzLaRCg1H<HLt@C%A0EcQ|>%T4$CtRnyASl1*3+a}D_j%l690R_6VWqJ0K
z*Vcxm!@L&g8ahA63niB~bhk(2>$ovxGN<VX2}Dg?p^LV>#}oC-4Ng4i%(7V;f;W>}
z^n=7t<Bt+LTiA-Rit;u7ko4g_p1uyp*kEvoON=z(2nigseM61d$<ONvK2iLN?YMtM
z2xxu&HQ#Bw-jB*Ax|uqj$D5z{<vy=ElU=n{NU@ZNAQcu32~^Z|<|G;G4LxazQh7gg
zxbTTv4ZZ^i9~%5;V_`ruj1Mszub`kJ2mw41Tb;7NCIx+vUA-6c{&Tx=;N8H>4KYQ^
z%i4n|R(c5IVvP9!FBRr>r+k+~I*Zx-n1sETuc*DpJ~5XTC;bxmp*pac7Pd~YKwL5v
z7RJwPs=_Zpi=WdC_zX1wm=MVGv{gjS(P?`W8K+PN-`vJLGi~i-V~Nae*1Q+qXkwKl
zBBd3vuY5G7<5)0620T(4<)x?^D{TF}`l@5cCn7t$*0i{jUSN2RN<vIHuJE0{!O8e>
z-L;QotwIJIDlbYKzeYM_7K+F<Z$b)!oV0cqdAX!lB`Qe+RIGJ&IdfV+C<C4m|Go$l
z&<y{)H_)F@QqpBbVOO4sB4m{8I6cz~%~eeuA$w{tPWW{b_wYK*_Qg3q8-`#T-kyKE
z#&<?vSa2aU0i$zgfT+#8{YON;M_aYW{p%R%v|NGP<q&iU$TtQ7_hw2}_W7S#Mpkh+
zKAyCI&M$BlfMTR-f+`Kd;`fU&1_3Q!nYf`PZmcq8%M}MJawf{s*Q?q?4dc7P9g#@9
zOMz?K`U7XXk1~>1axE?~mc;TRmZhnraSy4!3@gOmgjZ`|LMW?^m7Xdt>$7c-z9~AC
zk5!;QKcJc+w=8E*Oh66tXC0L(BIV4sQrX`ZEN{8)z2J}bz&IJ~-Ux31fo&*U3DJ9z
z&2-~e4NV_MxdaTSLF4eJ-k28o7{&2V4tfdnD6WP=s74Fyho4%P@+<3?j`aS?|GM)r
zA7S-(;sx<jg~s>~JJG};db1lEG`m|$CH7fu)c^n#pr-WFAo^C)I-E$t`>f!yH{ZVG
zx#i8X5z&97Z!M=<Pk9~*ztgaE5`R>SQ#<ita_L7A;<UDN^Q1`tvIzv=oxCmD#Y(*z
zg^O$grR_Cjh=yXUy{R_yW*|d&2mB4<qh~e|GK{UI2bFybu%~;@W?jy{7a=U;DHF1E
zzD2eITOL8Ng=LLd{uyBXDmU$PX;IK>C8BMQk6$*vsf3w{fU*#K(Fh=&@wCA#d>Z3h
zX9^Cih;~170{aTV_Cc9~!r<o3g$zKJv!r0w#qedC#NH!?bEP~zVYN&twHl_{^UN<a
zswrn7dquuSb3_-=P$$C7qx8dL?sMvQO#-51AM0j6HK3d4ezng+l(MgmfNy@L<6##7
zECE5oE5BB~dwS3L=zJpef=qz&#)bT~+L=b&lNWzT9>*_tZoS|Oy?hyrq(H-5APOF+
zV^JejN4-;c>D&K09ds`c5w=N-V^G^gl{82~XF0YO8I5rUQn_L#aS5N5TQaCzq5$|0
zJstCSTfzbkT$ud0($kD6)S!Uz8LfH!fsdX%_OF#-miuaR{kgjXKiE6f!)>EN#L`ib
zAYp+)T`Ka5-twqRJqbbjd$KA&`xg}rw0YCP9WG7A?N&k!nJtE72}hX4D-!pGbEYZK
zWUA+?#Ik8ogWOqrgNJ>)@4h%Ey?@UPDnWE_zOaA;=tva!l8N?G-j?dzt+dpg@*l$5
zhC;;7->VvE?`;SiojtwyPVy38W?<V-TOO~2$Q^|94<jrOZKwb?MMZ^mTAiA!K?x6E
zng(XxuHtdY#u`9-h0b$#Ji2Fl>?Z-Gta}n$pfwl+=k6aLB2iqP(tud(I>07g@{O!O
zTQ$k#WZ3E7;89C@&XDJ%l>+R=o{r3kmW97hd<xxPqPSz}tVcUv&r55x(Ev_pQE({z
zN9{3!lvtDLoxOnvDQ*8;l(IKVgndf?15l;ZD$lfY^FG;uP~jyc5zu`lfd}wsdCO*O
zkU7XtneqF!Z$!Eo=lDjV<Wi5szs5w5GLlnx>OmfOMYRa*ovFWOC*u<3wgSZ#DhHQd
zF;3UCy83eyr_BY^Kl9MrL2(1iSuq%!zU#QnE|8UePe+`Q=!fXd3lc5baBZEMJ&0+2
z)k+h*izN!92&qKuS#FN*R<XVulJrRJVleCN;rJ4ZR@zXX*Wu`(4=*HIj6S>n-uHF5
zqSq}~`t*H~H`z)2H+S>TtKaB;4^Q_LZXHaiA{j|A-ZSqCiGQ9rHtkz_F~ch)ABOs>
zQQYgk?(!6T6G3{hRTgnfLb;Xs*YXA(g?F~G7TA@*YL962Nz3kJ$;<T`xqrTyK?BuD
z7GKAy4EwF9#>qzK&@*H@APAJ&)N$TR)iYM$ihFPcEDPc4L_FWb6U4#v;V~BB<N%=a
z=(vC!IvqsR-=FFggQ?~LFRPb+nG<)T<NxG~>G29ccFFX~CF(oWt0Q=X{w8nJ(@9Wi
zp=-2(7a8<x7COp_bIUcSy2A`ABCZNhL*-rc@sbAT1gZ}GR<3*+(_0`l@ypLU1t7(l
zqn7Pj#qp?Z@|g@K9$`4;T&GUFD8{lS_}DPZ_t(va=ysq0FjBFY6EE(3xaU8<gL;Z4
zUbf`t-{PiRD^NeJf^nQvHhBPoU)6*^Ndigg)Wpce<Rd+5{uLl3Ow4TdM{=uTsYnuR
zOp!Vryc)B3b_J9gTlaeSqgfYcaXZnJOGGa0FX#3c1j<&>HTJ9`hV<$~!t7r;v;&!n
zRz<Z=+kXfL^hly{9C+btDgLFux=cb9PuxKvJg5z*0L<R1y9%qiFk#X8L`VTi1l6+1
z*cs*NJrctYSZ{pzF#y=_VR6hL{YbO@!i?5i`^*>Tw(K@Yt7a98f-Rb2+#I@IFqetu
zY!tIYcMK)6B7=yL5Wk+n(mPUgshxRO7Jphie?};f*TQ6Le(OXUtq{rhCa*o6pB-}M
z^R$RpQ}hf3xww<wJ_gX~3h|S=BoP2R(e;~HX!Zg`>k8AG1yAifGvqF=J?p~@1%zX*
z7ZCU)2=n7y@umw3o_c^DU@Du&o0jp&LYzTP$?3UzmBHP9@-}n|pW?Rcic<lSA?5I+
z)#%#)&!?4vzRCCgDYbWvsTy<+zXDPcH-H=JSK@mVmAY3P4!>7Ti#;}F&qoAN$8=4C
zW6z_Cr$p}VKxkMbXG~}K`v(G)jGOgsF|o58un$eVSZ5&(I277)6@U2|VEP5M-I;GL
z*xvKfsn4_6<pY~-gN<Rvn?#o=i@hrDgMLM{8m<?;)Fpt9d>2@AQfVIMr^d?}4L9$;
z|Gsns3qiv#2I>+_Hj$ur`rI4XN)YF;4Uks^!M7Q{-KYX;>Njq$h%~I|b&(rrUPu<A
z-a0DQJhFHua=y}p<e=jh<le~?^~i0yZ^7xoL3Wq-m1;_~MBv9DpMD#WMVmqbHd&h*
z5pahxZH@dHW>XXADuO;Sa<Kj^So=vcY1*&N#T_j^uZRsOnEbn9-ZMkeDoQ<hzwq@C
z^p>EeIs}n~txNtTErAKu7V3L@1gj0`7KElH${EOR6UZT4vE=+xnwAg{5(5cX2X!y|
z=jp|o>vdPF{+>7O6hLDnjcAbMV?Fl@cD_uyb6Z9n08Q)=!{jpmIBIjf7bETGz+(%=
za+doDKpF`{^B~GHm$Y6FJ>M_!0va{@++g*-wN_bXw#zFO*FMWP#1aPgYeq(~o-BgM
zuX#Oh&C!)=pAZ4IRbx$B?0BkuiEHR@$QLD`ex)KTF8gn7A;~4d@(;?I{a$3&t4f$x
zPg#{pP7;X3y45;@@15&J!{Lhu6(4X4&?JQbYy^Fv$tZ&44!lQ%j{HU#9vWl94FwRA
z0rbXoG(PidR=BRR1E`>9+4#?P`SdiYW{0CmH$~2Q_cnI={t|nXV6&ZzY~0{4V#)!$
zV(*HZ{mq?kZK?`b$~J_~32v1h*Yxj2fK8Qj@onl>=JhuMXh%#HGM}j2X|;C08$CS0
zyM3Hs>XG2qT?O*LN>ZBh5KOeDMX!s(_(H|GYxiW@@5=!t58UX;>RlDszo?yMOkX3s
z2xLBp<_X@$qdM^=ML1<_m;dEQjN?ezNV6mAZ?AZ*?|W}fdIWUBg~#E!o5q0MxO0@l
zGA#6li2Vj0Cbg<4OX93$uY%@cD9^M1D4Ig`UD}#Pg|ce@kE62=YwCUY@Pd%;?hpi#
zZjhk}0@6sRlpx_3X(S}JAsy0<43H2}Qfb);=|%x1Hl#y9KwvPo_x#>#|83XK_MAP>
z&biL{+@Cw{IO6SreGpuJztWX}3$87iJ|nzcc~Er^p6e-W@$^Y>ch8(V1P5e0wk;`@
zyt3-oXA|~0aD0c9V|Dl8(#E4~j~SBQ)`lj{qU_MO!>gO#IsV~qJufn_7I_*C{*y_n
z{K~G;`T>1>I})q~3ag-XF?n^@^c98U)CLhwwhXWqb-lS!3+9Q56C~*x;&>Qfx0#Ry
zoqodx*WO2^wA&vY-2614;S`ofiZD=pyqWZ}xM=D^mHgLtLiUt^{_^sTokQjg7GT1X
z1Mifl6^vlckc*=6XqSmEMng$SM+bjlB3F-V8!<nguPA?-yG%ctSs}(KQdlM2w%<Dx
z{~tjW-f(FCZ|=jXnECD5TlC0V{o@C*K8h;Dxo5T3dK+)$@2~VwZbqb}2@EDDSWNts
zkokR?va)KBK<tD4=W3(|9e)o8n(-tb31pHi+WM$03a>BwU{K+tsf)r(1!;Ne12u&S
z)N_OuHX0=cZ^nC7enn4S5>!8;E6!Sqd^nhh4Av2EPl;;Gx7naCUiQSsL2)+^nL-%C
zL0zDM>*Uc`vKMOa9$bt|M!Z9zGUD3K-)w}1yBuzOoWT1gh-UsP*>RY<_#+=egq}C8
z5pP^^sq`8R$d%6NF2mp4gPng3YF;ks#XB4T#Fw7IWiqiiu=d%r6)uI(jtCg?j1?Dn
zj(^<J0M&E<IH|h(XYw5^gV=>_ORwMjR!S1zCsjIZR2|J^qYAcLonkLp4s&MDeDjFY
zcQBsHBaVYa4LlceSr5J=FuK=Z5WsP+kH>$^l>q+6xKil(5G;+j+nF%&-rdU#SwD(S
z@r$P3`IV(xM>DxE?hk)nBA+^7i5LLnCj@0PnsDiAw7^zGIK>;iow_fND_Dy?)t3ud
zyMim$EDeaa++El@TxQ>D$%o@LE#Pc$seoyh)js<AJ47_)>6Im_m?*~FYFDpCN;%r>
z#PY0TmtlZl`#Rj_DM5P|$a*_$xBO{!#&tcoil7W{Cx;2GS1Mmxb(!ONiF?WAj+e&M
zcSBtWar{MLaep!noKXmg@iggPs6(5dF5mKRtAZhYKp4m{`eR&yg;OD{l^h_M@0X!g
z^Ju31nL7tPTiQp7U6fs*s@|;?tz3+app$)MY=!EsB>@r+KGl=2r`&@I1-xVkK^yd3
zb9%krQB$}DBiB-}Z|S>~volyrR<BpGxJV&-`8~2h7;M~qb0fA8L)5=e5DUfFZPQM~
z6irjW$%43cQ_Qq;pMhAUk2`U@8DA7^i0XVf4&tym41MWA9{EoM=Z<E{1g6`VI)p5o
z0`z%Z8=2&0M*yfHp1*z-Co<Z@(VLn+dA{ok?oc#HTxIHD4g1$8p{4fwmVqiB|6l~w
zdqPR8S&LL_mbYs#yhL88M3E%UCgezYxo#U-@9R$Z$GuD8g^hcU11b?|muFPxH0RX3
z%jM{>y+%!P_E5ArX6<cn>Zii3;KGxb@B2-Hh10R&(4dg)eUFO;3_aLIz!rcWB%g*b
z%$Hm)@EM?I9CwH9K$0gVrdz%Hkhzu-c)t>*b`tKT5!)i~_U){#;7^IidzG!i$vzF?
zT2UmENaF8fhHDNHzb91K6YG33P$IZ@pi^2iT?8qUYVmgSP71E8t6x-hfQMLM(Ft3-
zSj+jRwA`$COXsxX<TbtCh4ckgsK%tQ?8wciw}!UI+8gP+@Fy6W*X-p5UgD{zd%{Nx
z+m0Z&Yq;5SIL7*$#ukeGw>@ideR#(cJ(T%L%OEM#2?EN-BmRtEJt=lvN%2eSL@A}K
zx*cc{P#YJInqNQKU0;b<b1zbPI37Uy_4$qoSy;=*IuK1KUu}Ph^7Yt1*l2zr7&q{M
z1v*}!>QptdbTZK>sJ=~6nHh7bQktp-xAwOCC>yVu*~G1*Q5?2g72>{-yjUOlZGX2{
zV_RVL3X`OW*|^uSKX3vJ&I#EvW85IIYw_6K)F&4ePjD<DR4kHoirHHK;K4Gyh1pMn
z_R3cxtMM<&2}Z{~^=c);o8Aob?O|~QQwWV6mNhKA|2P;Z;ZNKh1Ak>FXq>LW^+`3D
zD1^@->%P#FIgQ{Wcq~qCofdO*!?~8iep&k7P~0`jsPHt3nxQnW^Iew9KTMA449le(
z9G_ol?u@Xn2%;0n*1jRIR1Ow%@94iIZDI}K_F``LM|JJwQ0j@1c!O*{HRD&Z%Dcz`
zso3pY?*sk)8owX&_li-=25><_F!k1pi<q$kAQG*9z?ryyIL%hE=|DiGY4odWe)x53
zOW<GlIy~u*HnM!`V1(4ZnC3)R<Tb9<nokTbfzmC9m{$<gn15;e3*hS-t7{~V0*1;b
z`A!D*^q>2k`wj>M`>9+hp$jqoch1USIHK`60k+zj74{eV>#3|_*l+KP-Oi(m9=kzT
zKkdE9^O+KAy^IH)GNeCOE-R$lYn0j!fdNK`kk`r+D%Y-phtMxYV=c%I*e}9*d%Lak
zsD|M^xtqT`wo-o%EkY!(O*$Knnb<74-T55cAxU0}Zd(3@v1oXOZRnDI8))$N6nh-5
zk>kVFd|8`Xk)p^{ciFY8Q|(qo6j}`j4*eHMrIRZ7YvPB|UPJG2qWyu-W>q`3gH*~}
z)1u>t9fRGT#z&mQ$qTAyQjbN<@zC_5#o^hdOzPNAuvz`%;5C(J3W1m_lGcUG6n4#L
z3x31$Uk19AM6x6f=nzQW_-g#cal{Gb-e6Q#d6&!AX0y}%Hhk}az^gbe?{IDZFx$u(
znqL&x&p6Dk9+unVI=EOL)V!RigDocksu48Rp&rZ@eWgWS?H|$4-+K^7Bz@P`@fI{X
zkZ}<KpB~(7{bWk)zaXU(9<FC(qIKY^Yq&)kr;{YZu_){S{Meh=_1fb2?BIaY_7CTU
z27OM@Fwq#%k5w>3^0FC{ieHzV`@McG&-x(?)ZRn!;UK6U^RKihkiS9ekO-sXbo?T!
zJM%Phy_WU9Lw(ovxY!Hk-)9H3=@8}QC-H9^?IHsbwH<D4-0Eb}{2SQ)GmrO~pG{(~
zg_M;)dhA|zge93$hjlR@>`z#ws$?hI>?hOtEx((j^(&F>pUtdGx@x)sHi^{|5>dtF
zH(s@TRJqvQdsN$q3+?va4Xeoez}iinwD{+@NnGLjeM`}o%62vtFM=Zk@YUw$J0B;5
zMbQCR);lAna`r>_es$n|;wWzE28?ezw^Yna=}xB6C-s>&EZ&02Z3~DWOTlI(&fzwR
zZpl)kuG~w(<u?ZM)|-O9)`T5lEk3e@cYQ*4p$VRel>jtD3;dACgyaY!Hn*&Z@uDu?
zv88hB_;;O>Tj(c~#mZ}eaqtD8^q$<>V-NTH`psu?XA=G<pyQ7$DJ&q<M?xB%U-e(;
zn%Mjnct?wB(r<;kr~4WvCL8ld2fw~{iad=-96iPeC%v$zzpv$<zVSvln~ka};pyy7
zWS#J)6cBU*xjhFhGQIs8#-3k=c;v*N2ki&f&-kT6|L)(gXcIxvV5DTb)n~YN7cW~l
z)M_xaNo$!>%#QjoyIm(wVr14mx&lsLVC~o5x~G5bmQT=FZwatWN{{&OBjfnRuYh5&
z-<c${IzGRW@KTk|%jel^&jgZnxjm>KRKLAiIQqIRg>)>%tem&%-DKdDMWkoT-?SA)
ziGUmp(~tJFjD;&<#(tnT`!nTcRcqsnAj`)-bLGB~;w1a@VE#OC|GZ*IkMP0Yh{QD-
zLZaWsmTARc`2EcIl}%N=yRlB#nZ%JPP2`vP+b;!mPdm0z>+D92L;X7^>tZpsq$vnL
z5C*m?%dH7$XOcr9O^Y(*)EWJ5(K7(0(?a#2WJ`)vwbV+;UP_<#SvCt0*Cn&f{3pRs
z4X5}IGj1Fc>HgIk@e;>$Keb$QRg>yF!V3pDH$<G+Z2=JEB@i!f(%mU~`zpIowl89x
zwVwJTn85>7<8Qq|!wBtA88YLu1<fMGF(Tt{4OFo%uf3Y9fTQ^1@El~^-ccS1()_cH
z|I4SsU-y=5#sOwQvVK&rG=AWtec{_xc$nxFol|-t2|=OwwC2o){zSb^H*y1MH4W<p
zUv2U-w2$cJ28v#R+<-=U7bkO*{WwjJbwUCU&r5C$6sS&TwZ%QUr6GyixZ?#KXc?K%
zNk_?1!Ge&c+KNYM3B!QLRqe1Sz=szR0aZpA`?=_iOb8#6g-E_j_0sQEuJ4wxx{L0N
z%LFQo0g`rs^^&ZFuQ_yi13Ct&!Kl|udi5x;|Ci}C`wovK;|HZmbY@28u82*7lzq|#
z?vvEF9ZSHizwdmJ!;K8;$e-p@SOZ<~-1ps)_M2~rlJ#MhRV|q)AQ$Xp-M&{lFY-%}
zq<4%RB5@ybhV+^EF%tf#yqz`n>NkmY1DrQ(wqSn9%$|Is3D%GbMgzr#qjE8(8167p
z9-th#`EL<d%FzH`%M%b3aMkxK!)8_}ANDT~x^)!MhEsKd`I}~{d*mK;@P5lm*4$-;
z27F%@$FMkYc{IX<3=8Np3b#-YerO#~yU*<&?5F@yb)IwfDHlkG-U3__&`#X`7f#)d
zI~p!V+bI|XXC65})CZAE#^??zQc*LA`w+%*H3RG<wR-xGpn#&V)eTxH!@1up^^o;I
zP$Y<d2AW^avZ}fDJgD|8c2#w*1YcB$hKxe2PS3}EGt35)2d{0v1z96)Fad^j>!@*r
zi__~})7UvcKU+^h0MHEfpC0Rt@f?930YTAKy(b!YJmq1A@s8S3m4Y{A2*IE1Dib3z
zOcK#6X&XKNpzzL5K7%xl(4vflIQ%V2d4e7^(3TeC^s}xLK04+O!LYwgrdE;p-?^gD
zHi;=j4{GOLcr5o<(b?#B55rr~EJ6hUZB|MqIysplwBe#$p=KjPBugA!zKRecpr5Z{
z_3Yg?oR$FU1X)XvoC<d}&nC7H;!`u>;_y@in^P1;pDv@N(SP1c(J;VCffT1jyKJPI
zg!o_Yu#YSPZcGUDyE^5BZuDh~>K&uM{WC?dao49YXgbaLFr2#)QvYl#tPy#Zi<p`o
zDXj9UjJW&IvrK~cAu<x$wG(aKyhJ5O!KYg7;fpsR0J;Rs0v;ze&RmPYxH}P;tjGAz
zr)2de`?mqH%p)ydBe(h1sKsj+PXL~X`+vyAE_vJJA65_9vR!(C(<H=oU=1(;0+BIZ
zx9rGy#r2f}gP5az7~lkffhM-QG)}zaZ$5da8NpYHHrS3ZKJzl-_F`8Zxal(D4TOJn
zfmH@Qj9y8~W~$|4`E{|%49o+TpM+5EN%~!`Rsa2+V_A?Qf(`dSJc3HIq;1dt)j5}J
z{~4NJOUj!7?0bXO)ck90Ij{5zwE$F)b?R6gN2LPGBs*noSuNPa>kR^WllwS{dLs~7
zYLUs|Xy3X%ZTxl@n!*8UisWir3o1&fF&2BNi4Q&jzBHr`!NJQ@*mO;6;wqRc-?aBc
z1<8g}<3n(n@CBYsJXmPAV(T;|w*><n2&kAjf{#jVi8J@jgIKSUQY;Nl=`cQy_QjrI
z+*H<aq;l+b@GOb#P~o3@BF+`ZWFGme_lN~Lf}z2Q?_(jBAvPGY3RO2FRS29P3<JTy
zyJTnt*~Iw-mQw=uhBS_}bE8(Rr2gRwvOE(bgi{b`HQj{ADi9aX?!4`9^=e9_peWHj
z3EFUprF;CcOD}Ue;9GJ$C20IAjGCZ*QY_^BsLCQqCZOl(6+eR;VgQp0JOX}=q+LaF
zd1i1Kv|7L_0kzHq!Y0sh;}N2si_<EW!Nw>t<U~e};78$Ww9n^nFS?=S*dVxiy|I!C
zSBHF=lllR0h%-=BNqN57`b!OfQQZn%FUObY)Jz(Ce9z0)$nIkxQ<vAL(84a_RnJX3
zI=!oDwH1Js%?7YcgY$JJb?;1EX5IBR454^27ZeIH5dJs`Uzd}mtJ7?JK=<H)1+W`H
zLhgY~=ONilrW&1ml`o)G+aOJMvL`ExRq>5!h7&K{^N4f6q@{66*UnXF%qurAZH;sk
z*pCX{{7TS13pEl`R{QgH!`_brW1T=$i$%#G#eWl_gA$$6mD`71y~6Y_7$e56igBN#
z@(vZLK8NfOSXDV^vEpl3;VX8}@`5r2#-sT`y&<Y%Xfc3M!N3e$Vcfxx-;ea!4qrHB
z=GSoms~cSLV<UHsHI9B09&%!n-{m_C)UPyu5sG?o-eeCsd=$$712G^C@{1lnb$=aJ
zeUtq)HE|B%imM3mF_sQG($Ks5lTr>7j_jgv0PjYjnAjxBF5!Qa$PN{B@o?xz^94Jy
zEOQz!;-K1+88<>gjA6FVa&YZK5;dU8WEY+Ik-+2r53?@dHiWR1t|LE0C>%70>O2%e
zBq_p8k=G63(sbCD+-V|Cl=GHQS*njv8BR6h;YsH>Bnz+e>{F#<J8($R{=fsLZ1h|6
zSKT?y$)y<ZLD3<r2VaQcF2~VkKe^4~?$96U``!(06LsfDET6BW8~7{*{5P0S*I$W&
zJmkVIvKoNvG2ix1oBzA8AqC|{5eJwFHmzbAq5a3aJZ-&Ew#Y8}uMV<!v+U9ZIc(21
zF5ltNrj`qa`Z%~I8Ku((7jsr$FRIv^0q*gG0)}ejG8jiv%rh~KcBz5RCmO!~U1thT
zOdfJWu02>Bd!Vhch86xf<iQ%Lpmk!lVdbLHF7_g;&TR4kG&+RkZc;)9I$Cqxq2LW~
ztpTllCS5=CNo-bsH*d3T+Um!+@ak#&D*EQvPYS1sKjrb&3D@>BW5_}6duI=J;G-%G
zgK*ECcIP|oUZ>sJ-VVMbH?EM=HR=85PALr7&3}($2zAr1q^2$t-p4SZ_rdO3dy{d0
z&q>juMoHVyZ6E)mez_z$a*{Y_msZC7*SiM_8-(RNoz=5g{K4##qfUvpohBGJHS+!+
zt8bLzzw8VsIK6Grv*iiyu^(Iu%F_vU^<1-tQ4q-qM%YrxuHaGh!uk6~2qoY^V>y(8
zXj@wUYAj0oUdi(K1|ZX5KF1<ZNF}$4#jZHk-aID1INg7D#I(rx!6Y)`^==!@z@PT_
z47#M3@^`jKT<0o>$;Z0>a@s=Z8_ul3{fXgs&;f9Vrdg)HNGq!7n^}pL%LREFe?qAV
z9FCDu`z>!R44%onc+r<2p^cdyn!r407dU+e)uyTPieS<;0Zky_kUH0|UZr0<`F2%T
zhptASPgvrx7&KDcf73`RIAjy(R`+DQUx|1_RKdVRL${dcCEjXl%z{tJ-%66I00kTx
zIfsGdyW?BZQmI1s4+<zW10LQ6Bpigyp}4fy3(oRAdyfQFvg~uuNeno)`o2+j?bs^f
zRX5k`_X-uH>xVL8Xes<$E{}G~yNJqi%6#1aOd_v=iwgVcEDq48_lMtfx&(kA0DcK2
z4DN-Eao8%XTD2oj3El1#fZB>DMC3@okkLFli9<doo6_aU4#pCPGAfp;hl}0&OSNX(
z&z3!~N}9l+5mm`ocN@Ce_Ta+>#A-LcDu7}`12(d_zPhxVmg72gO%y@p1O}uP?!Ftz
z-NRGF;%Tz40U~KSI~@!VcVHq_Cs~fGp{?Kzc5|Xvyn_&o1^6fdT6F&!?*oj!P)5gn
zX|=C6=<dwz3)S~^J~Hxm;R@%QroROp*^}kXlgWjL_JAF&5=a;?&{h5QSNT@tla!bb
zqxwnDw*2~k{*x0Q`lw(eRZOy_hK*YqRNofDzW;``b4l=Xvgi1zCaa-%ps^Xe{=(l*
z=t^k`5$bg31w6%gE4{qLE^b%dli<=ELLSmH*tFjv6O>rqlk4~{jaG|pAGNE=?Q&i*
zOG#T+sg~fTcjf`HT{GnoZiroB<EPJ~$U)v8hTkW-=q=Nc3VIM*OJrh5T|iR=txP6a
zS*+iiY<Cxtu9eSKrq^^T9AUN`h93B4zjZNgla^!_^*4D>!&;pF6XE+$DA?+s@`gkE
z#!2Yu4Pyxs$2?ws)p340SHw(C77PGj6C-p>G1gM@)AhZU&7^TElxRZ?K@my5-D$DQ
zXO4$drWFyT=|(0bZ>GEiUnP>6vpGg`JhJ3t70eO!`7|uxt0BC9BKbs7ESWTtg?eNV
z(&v8#jfc^cf5yR6I*!sB81(q+gAQPnAHklZay|Nax%-YRT1`Ba5Ee+zYCrsPs=NsH
z+6dNyznPSk1%dCZ?Wm*6494izekVxE(vN}}5&~166714O9t-<Hi>@ioZ`d5d0zv8n
z8FwPK+Rx0AOEPoR-dH3(L|RLHwMy?8MJuyw$E5VkRD$HNFRa7kFmQii4l`q4?!`PV
zMCjc(*JA&Mw}N>c_eO<F{hbhx?O?rU4BxOyB1JQ)1(1zAX6Y?d<v-fi10^oo=SPcc
ziuJlIC4Q&Hrill<m$_Y(nk6pwJ>;aqB6?-x-HH!1eo@X`*HvmxOq<R+l-6V@iGTjk
znPk3l0$LX}(?)fq;TGO^_FT4@@0ZfCIwiX5X~5a2Ie7Zs6*zf-5o}y$D&vkyJd77I
z^5fP&*xtdDXBR%JFy=Nzmn-oH-dG!OdY<6wUFbFCE-kSM7Kj7i{D5I2?cFv04@+e)
zkolY2>#%VeChnz>-uX%0*GWAv_8ms@IzHby$MLLFSSxwx09si+EV-06(~&^?d@G4(
zb?c0`RTsF=VsEs<8dF3VZ@lyAKZ+NzTT6@sYT4XTKSAU*ZU%61k1GYRrVc4F9>)w-
zVt`kcThuVf3}G#(;Jl>VZS7fd%$)`(-Bup&cg{%xe41_ImEhh(fk*_Z0dP_b6_J%)
zl`a-%O)r`pm+LI){&msBEA2ZrT^lN!7cUpsE)J$B4l8oo881f7WVt&VTxgT$3I6Kc
z5~y0`$1{9R-A6v^kiKEr^jo7xB?5=lp@_=-s8EDhkndem82Awc$skXRadgpU6!dBQ
zRwr3qoTty6q!lX~iF}^XEP*LvjnoNsR{}A;)z{eY-<~xc8R8pTT<=l)(XFPp^ECR!
zT{8KSHkHZYYn3HtMuTz<{Hg_>hPAs<fUX|S;Tfp1bbN-n%3IBONREMGQ%78zc+owe
zpPZ?Gl-53!4#ERoM<c|&DwByosoz){x!7r&4@N|7TiKPO<PKE!7L+1P+cb(ZrLsR{
z)V*g5^)&OX_m>1jwUgeJ!qY#mNyVjSR>QpcQX8UG-B^K1$IX>)zqU{Fya-Gj0-?2r
zAewFY7-ZCarb%D?pllKmyxoT+jLQd*!B*T*$rORsZ+`N<-i#C%4QhH{Bh8ZJFYjEW
zwN}oOBG)FRaFQ;=t~>03V1sq^lhm+aWXTOtnun|FX*0v{IBLt!64l{J6%j_#kJsin
z8T%SaYz!`ehIzN)|Aari#7Um`+@vFG5YlHn>~P+IRGyGmK2r_A07YOaw}ZUb{M(-L
ziaa+M@%<fkyL!^+Wg3MBuP1J5)c{hT;3;@ntWYTnd;jg>6)01z4NM1zjwRH~$cuYB
zHA!wMxylFL&<3lIS9>^*U+V*VEP$RRQh&VbQ;<ie)=jUCMxIDfQ$%Yy_mUDN<-#HF
zt*7$STx2RE+SA+Wn|iKL^^Be4KXYb-J!l|)*H~s1pY)l{J<Q4nKW~_Kvvv844wVOc
z<#|db)oi7hM|zSKXfuwajZP=rl5^Sw^A&IzXj$J&lt+%|fi$HN3h}!e&rWrO-M*Iz
z9ee#ZOrln=L_a>Gc0CTj-mDoa$x6~beJ7D_BD(do)MNP7sA>XVIJR{$$ZIt25SDBF
zDgK&`OJKpYC`)3|`m<=f(=46r9XAe&z>F0it*kTpHEHop>51wuwHd^AyTX}wZRYNS
zRJsq|ys{AqG>ViQ)g*ryo$UDZ(O%->8wYX?sjB(24?_xc0#HAn8hH;9kbfLZVb5ZW
zEXHNKOp&srr>j}6gxhA`>BYXO^v9(0z<`*dHn|us&WcLWlTU}_*r<gn>0l8C)(|_{
z`e=tV3%xkY28fv3Ds@Iqa<YCn!wrrBx+tWQ;RwjNu<R;Fv;JD&GtR6&^)Mbvh+_A<
z$2o9AT~4Vi*)Us`n1rbsmd~ShoE*Z%6;uCtXNpJtmqwqOBLL^7$xc^$qApHlE0^|A
zg7H>}mE|9K1r+VWWI(-XwDBg|3eX)$J3eds*v%B9y-)Rq{Jk*xnmdz|5tzSf@KjRs
z?-TbTZ@oIx1R4Cpob)8;Q}x^(>(WE1^<Jo3cfWzZ5Umo}c;?GPhXISi)4{vQfl<MP
zp1gL-BbHB_-;t_r1CI7)ze2pb0;F%&_<uRYlqlk<+2mB(>Xg%}1Dj)n(yX3iJb&Nl
zBZ6Qex5~#jN4X~Hxl(K%FwkCCZ3z6W9r^H_4JL2ig;3Vqs4uTvqPNdp`M!@7GWa4m
zD5&(ioIkhWz4het#%(orAv%fXQnCh#^cvUR&pP?x<>!URo%ipanBv*<>jlZC3l6c-
zovhSu!sHvVb4b<Nct|wyP4yfHmBPnwpD7$0+1hW#aa=2J=kWGDTPk+7%HOX=J?!ja
z&WjOKX4`lBW_r<U^GZZEr_q^qKlM3WGF~9fCT-ymQoYr2;sL12UU24^A;uDn`_!na
zUpIKUiWcS!EyOw_?mVrxkuDiE>|#H-F+n>7v3~a?tF8R5V&b}aBBx%K#Y!t<0i@>B
zTr<s;bj%#GS+5L~4$6DP%o5$DSt9mjqZs_5CmtEJwOfK$fHHqnsSrgY38E|~Dj1n_
zpFel<0h@J<uHXjZVTG>XR1Pvt-lppGzdH>l4l&@S_Jrs0{^kLp*WK89?5<m}6LV8{
z40u`N3GQO4LPIs@@r?eSDd}z8PM2ayiYH!m$b0mxcO+{Xj?g`y8pYi^jc%tcHTbj~
z>HVkWYrI!HtcydNSP3dLv#)TA&^7%iP3^19DrIe^IH{rgd=)@BPD@%0I{7Bb3J6RN
zY~MvL94GG?Rfzs0iT{DU5;e%FeYzK|Z~i!G_VJt6-=QQ5jz#|ox?D1SM%)<<730BI
zg_+0JxwDcAH+8Q`Yo>nt8!t80dvA4lzaWt|AfU-*ddyRsGcK~%RKKlh@JFQce-Aga
z;tPgsc2-|kA_RrAEs@_L9J5P9Had_TE!S_{&;8}#F9oTGwEsLBj;6cg%CI|KbRZES
zMh9ces@J8H{P!31!paPqL;ZWbYb#%ZKk}vMFP~z)-%wS;mwP+ln7gK;H+(*Ga;^(6
z%nEU7`<tj?KR(~fu##?@z9SmcYvJm)F6?+YTG%|t`rO!)#WWm5bG>n9xwGd_e8scp
zvDTr*7ZQKXvOi~eAyS{XVIbYF?}fY8eN+Bt+_KWe&DXnFHY3XT+QT8Sa#D~p<4_M*
zdOMM9I5}t3@jvOnANL%EU*B~ty_jmN`+m4G_4Q>*Ow_~AOMz3WK9>c*c39z~lp_(Q
z!xq|Yj$Cg9vu;qYLjr#p*9pavtA7)!^LxHnzm}j*k;B~_ILbIyNUN#f7wiI#cmU4l
z;Vp!6Q*Hh*qig-~a*B&Qb;ab-Jxh+Y<`AK>dlk;FX`ae>$TCQ?x)sP>dv^A<WpjFi
z_q^jH<b^!r6!gEm8RtoCc5U!duaR<F|0?71v7{cOp9;-;ea@PqO@I44Ju#1nAF>7q
z*Ml`1WI;a|H9zX7{^yTmzs~bQ%TTXw&Zw%PUX0%nX%#zqA^9X~zpQ7~_bHUWgG!e@
zikkV(i6Z%tU7w-VsGU$4vW$dDes&`z=kyz+#B_4v!eI3E=_vU15*{w(dxYK&=vk+n
zGj3IxphMf-ZnkWGsFwTvt30|Ka@hSIM4A0_kVN+qYJvTF*i{!D9ovl*^xF%2;E-{j
zhAy9ACK6x3o5n-k+Fn#m2wGFX=9GLws;!NQiC&y#=<sDJ%P=B;l-=*)pG3|tA<U_t
z{pcmR<ur(C(nMs|7NV-J#jydeXWr7tKyxI?FtMCZf0#-ZlcT=YD0qW8iiKEZcB7H8
zW8(W`{>`o}34;&wi6N)QtQK!)I_;|=?^dpb=d7g_Qx?jz;|r@#+sAjov7_I2?8e+1
zsy%-s+zKzq3Wd2tKfj^61Dm{Me&6fdGylD-1dhga%*|HlU$P>{*yq-sa(B$!V?&E~
zyFyn#QoV&Fe|&!^wpWF_SPFS7cD0ppkBp#ZIG*oSEJkYx$-@h#Cwyn^sdD>nWFg%Z
zi@$$+xnz5siUawHMX2e?5BVssXG}jUcx8RN51QVMAP4JqZkOK-?c2z0w723(yEJ8^
z*OzAPp)A3`k+)Glf8Tkg7*2QIYu9z1876W6+sTcw^tv8FlT&hXd6(F@ZOAC}v5&6B
z6aSpyUJ<DKEvY+ucR)|AtV1z(@yK!+UYb`#Pih;fEOMRb&Xb|{1rdTH;c;TDUuqtk
zj+uzXqIN*<x#HCbHP4{j8B!+?^>4>!efV(lgiIc{MDe`H_pxpdLMRLSCHF&uLd^E-
z-)@xB@_1pI{|tE+axa;f8MP@OGzHR%(^dfGlrCB%StgyTw_>HQXsIv>KfCd}U)v@6
zPdRx^KJPTIf77Ol*ZN&^2S46+-l42KMeW){iR?xSXHj9pmlIp2x|a~_tcMJ!_1$e;
z@@hP@ukwSM){NE??020CO3|mFXV7s&p7_%m7Itd$4?+p0Ekgp~tL%mX;l`2@_RDm>
zPfr{VQ_1tv&oy>(YI?3~=a_M7l$VJKxd`Sgs{YQ=%pyhqd2U^Xn24D!(;oi^r~nd2
zoD(;7{zeOuB;F0ad`k!%%StM)6n=vwH0aW;!`QB@bl9wUTkWh%DVj<gp_^tet`SlO
zqhqR9;e|c!f!r+*(!US`wm@|EP%I701-Uf3kqUniG?X5K{?2xs3B(cxi$hYBoB7Rz
zhAgdzMa;!VTRbEy@=>3ZiUu2zkN^gjRrGBi^ncUN162>BF9Odq<|{3K&9en4yR0D1
zQN#K*wGxNqp&mbWrt$d?gZ@QPF9y+UaoDW0vWxts)0TdMV$_yFbR!_&&t{}c9a);o
z2N4|aJMO0isDwq^jtJ+^y%%1;wl~6e*o$|*<*aRMej5rM3H*XkD1p*4d+Ul--@i#-
zP?eJ)d$vZuzv3N~xR5dI_5S)<?_W}0lbQVB8C`hR+*_LU2bNq@YFn2a(*seFJ2|st
zM&F@QYX)Abec~1%#o()=;O}(nRbv{N!fJZ|O*~^BUT=)nN~7)L@q6;?y=b;z^xB$@
z5Q)O+QOrUGdeixE@n!%copJQ>aAppZ?1ufM8v|_w<<T=S`FHnXvT~af!VM`u$V@z`
zFA3GX+)0`yca5NrH|7+v9q-SuU<@%<W2pbLrQ)i8r4^l)ZBO&}6ZbErgQ?EyW>t_J
z$>P5ogRWOGy)IDaOSX3kj8(cWDeRl(^mwfQ>z94cn~$1eTXDIkqi*HlUDuza*&JmR
z@EMag#-5-8RNoetDH6ulUyVInHib_9Mtr+>vE*5%lQ#P%y6LYeEkeEXL-p+sg-hS(
zWk<)3R6&pTAB-W$GQKqsQ5GI=ecAhkeD^`0b-cEkzRaB$U;EH%|0T}!MUHvyw1<7{
z<GAl9?<)k0z`MVW#n<903=Lh=jC|{Qu?L`S?Q*|oC}Zhj^YQ0J{3Mx>r`D+~=m+uH
zLq^}DIYjvuq%vxKK%>F2^-J|arKjrT`bLrC$l?>^6T`(4tmJzNZ5`<1hEq{(k#3`%
z&k*gJ{A9fgFd*#>PL4-s90KQ$msXAqB&k1|lnXm-jGz|K&LoW<ctB5}$XG`Cy+Vby
z(m>TGge+x@V%M{TG4K`(Vtfp#DlYh+x+DuFX7P-Fee>b^W-`d(Vhy5>L=#y$?CTzY
zxb<hsyY-B_$Q4UW$e+9+vUB+0dEyzkEcU!K)Qie(`cdq=71rH9w8&V7>D~RljDqEf
z6X!0uOP+rM3#t#9D5L0^UkPwz-{Y1n);0-7d(8v}{9ch8Fd2v>N}tkcqnhS{y=R@f
ztL7?Wy_fl^`Wj25yKJLuZU?=Ff7DV><gHNkWB9c_-vVCDcH{y^uF&I)2xV4ZboXz$
zgP<3M5LIn}hhsPv6r{%^py1#?uM`z-Y<3A`4L=VLc(f$r1C4V7smlvJvfRn3nk@fL
zURS9u3u?!`^s72~sF*2mLGOy1Tjgv0C(cX%(n!H$yM8hFF4XW+Yg^z5P0kC_AihLe
zcQZunYNR0xbS^G=_uVFH1gfg%jJ$(?1q{RoaC!fzU1Mg35yvDxg}GsjB7+Pvx_uqC
zCuzy+O}-=e0#xT>Ud#`U8l*P84@M}oV7<)ayY|$`^o(Vs^`_yNU1qPV(gT(EJE+7i
zx^38ccyqHT<Zgo~QfejY@(1LIR&~?2S6O#3UO=|{Km4Jk$?ENg(4fA9nKZfFqarT$
zm$+YIFn8YlDmH@AVXyTHC?FN;6-Ush1jnfEGP%oquNc^k0&2?71Fr2^k`$PT7ESjI
z@k=WJmbe*BKr=#XU$W3Rw|4?X4XWfvMVUXwuSd;#f0<-Jg)Nd?dQt3u_u@;7avVIT
z2iHDg!3Qy(JVf5Z_L+c$$BOuz%LCZwr^&z_N6Eeh#owZ3U(@<UbuW?9POn)Cpr`{k
zrOOkxAT|0|t%!TFL-Gj?isDcBZoX^wfh;s5_pcVhwV1eMYo1f!j3V3WsOv}md3I#I
zvXpRd@b)h{GD9u}-V?Xv$naATsm(W=In^ob52bi<+%=t3K4a1Swy3&6LGrL6^o93T
zp{NAg)G>Fl2gW4+l%u~?VfdtN9^{2=IR6(lC$cX=q62l5wfm7%usF!HSDdtjuHvMx
zhHJC3MX<$U4*|lf%V_Y=1-?ZeQUK0Wxx{eSKb^U!QAGbU-5X?FWt&|;Vg6wpBzj5!
zd-)H*(T@ab96pD9b$W_naUKS27zy_jUj0&%E1|R<oKUO3kAMEeJGyT{usCbEm-`T^
zYJvYr8RWr|9I5|RK7H@Ykk2D{YWWvIw&q9YSwpI;x8I>fb`9>Wyf_9<sV5;p9vF)2
z7p5LpAEo$dLe`!k7d}RW!SALopLyj~-2E?uV}D5XB<CwB9+zn_&|#HBckS)J#Zn@@
z5s?n_#OPi%oq`JHHu{0946jF=;gh>DU6#hL9BC)NnT@M=`uitEhE&^3gb&cmv@AkQ
zj0|33_zz#JSL8oiNY<)==c>%`bA#M@{klv#avs!qq-gr`8_~>DbU_#T5%l$xA7`lc
zGmi8d&7c&faTs<w?f*^${0uMBMFYNMWiY=ALik{~eVy1fDNPW!5-YyRRha5I4sn$T
zN2x=V*SaPGKK#`tmeV?%#XAfPlOmk><^``dwDQ35&nh&cfAs<J&1QtaNzy0&GNrL(
zDPKsdJH_PrL7=c}z~`4Y$$odK8c2*xfVYGF>>QIF_BP2UhS2YrNi7DQrvmvZZlN<G
z@?kTy^W<`PefeJGR@x`D4t^<UA2kKnN3GpEXSjoY*c0bf``NwmDeQI`A#dQF{k{Bq
zX2q19B4<|)`U-WkmKP`dU4Pw{gs$ujf3$ZBG)5bpU`x+Dh;_$hX~o#Fr^<aXjvAO)
zbu*66#*oy)I=un^(%O@t8^?s(1V4I@2f`lu8-2?MH<lo8SmAHhbI5vO_4GVRgfIG}
z;=gDe%F1gWB<U4J!{wlwteIW732+3?u6*8hUT9jC^0Q4DDek-kX;!Mn<nAT4s7+Sc
zHLw=$B>FS>wR9X2g@hqgDtPkcO$PHbh1B}bw2le)0cJu5M^|6Vp)RV4QkS^@-<N7}
zm!@HMAeQckYZ?<V4ORXx11kCXk0=MT3E}{Q!e}@>*q#W*gvD;U1PJH<V?IGf0NxNs
z+^zaYWmr0cf=yS74b_@|fm_Gm5Snl$-~e#5b6ie#_!fTcoP=V&+Hc`0l66^)&kB|B
zJLUOb;M>*)#O+HG3zclyXyI0UY=V4=qXVA*skwEJ;-Qfr8@s=)#l*@eB@Pbtd2q7H
zs}Il@^&<GPWSeaQjtHN*q&13k3z32+kD6{>28R@mQv!C8rZG#?(5?u>WyfPHnbj}j
zC<F|MTUTz-1kkJI=RE3S<P?}ZbEI4bppS!BLIyjj_g?59+ld_=bfSQ)!8$r~rZRaZ
zVZFppo4M%UD+U6``L#W6*;bOLwOfzvgkgWrvE1R?i%WZ-6T{FVA3J&r-7+=%BRCYG
z7s$iq@G3;p<4|Lv{IehA^hi^{*wKP%o<{w>Qdq<zMDrNdoqeiE@a#i(L7i7_^TQbi
z)W0k?HOjG>aK5@3#&<MSJw?Qba)QtKMrmHWl2G<T{g}@*ydW&{f9T4)59n2S){l4c
zzvOAI>joYm&<GzBqR5XG+cz;!Nc82tt@NE0r$rL~sODsKXsNGQ^>TxeUq#N|{FoZF
z{ZNcc%Pk1Gt+sR)fujM3*!z6v^40Dk1Isd>{E_qNqyu=ChbKS&Ett`Hsw-;jM3Ibv
z86tzLc~y_IezOp-4px2wAXVhy;AMya(u+I|$-61r!A6ZZ31OOg(uJ-#(V*pAHDetM
zbNhWsU?d9U&^noLac_3JjTHeRq1Ia6-YfztE1#J?#MLNM?a!so7sqX%!Q?bsNdt-F
zWgtmR(+LgAg45sJ?V_Wpc)Au<k?4cr!QHO6sz4jy8(vpigIaOD^jBS0>u^z0P5#IP
zz;IY55RGJ=3@LaXOIMgOcY)wm>#)+J*|n0E=5h5CBDfR04Dzhd@MCe)=yxrK-#FSm
zfhTR9G0oppL!>cnDHjRlD)Z&4`5ysdiRpjh@=m4wDBB1AsFzVnfTu?Mup+STE6vXW
zc_*<-o>}+8oRtX_07BI#!ssqQN|NZeHJ%NDhioA<os`NC2z8dk-Hgx%`oA=fVVhsW
zxvJ5|iDtbr>VcdZwNMm-0W&9gmDW&csVVa#bAx#k5m8BC*p{KyHk?eTKuSEFp2ABD
zAOR2nV^ZxZ=+2Nw*LLmvltv7`xQ_-egBL1jUUr*RNN>5Y-MFb)3--C@R8<kD(1w_r
z{uZ$;7(R$QqdNr8-+Wur7-(-F>(3e3oPTjeO`r+~1sQUbeW3r5x8FQ!Q@Hgndn4=<
z8De&e{gP7L|IWe7etH4qmE_fFxYAK@`n#mtFU~rwE`(%%H&3<`CblD0L8QmO(Ld-u
zJI}24@@<^+zJlK<MWu672ipD`4&OA!(JZxmVFqOZrS!RFV{deG&lVZaE<fIbTjx)u
zv0O~#s|U{>v((#UkeEqhugCh%7Ad+w2Y@`}LjS@FZxqtL_FIT94T{N>yvk92Z*V^6
zk~<LFM}Jw7!{(X&0Z+Ae&uEjq_Y7br{+T@Y<Y1`qRshIU;hz98bx411{u1MFFz+e_
zeKa?q1q3_b9dke>y|8DVfBwWoIoxu|T||g+Xku;Cy8J@#>dN{znh)|mK;t4DFA?^=
z8Hk}dVlLCqGZP6h5G}sQ5#>WJfJ32Zt}DUH*Cp_p`>yobsDm6+B0Uee9#cQKSXeO4
z7XngS>-*Ui9P*!fUR|H~DZOF%5&J&E^T*9UcUo=gI&}8=rh+5ygBB&enL539p<;MV
zj1U7>K?G>FW{c9`>*LM)xh(^HvM;aRY{?Z6k>{R#%^bx|`?y&W(>cYSRan2Y=(DW{
z6GLjp#i-WvvGkL(=M<Lco&B8v*j89qT^VY~$#K@_Pi#B!ED#m+;iQmYTj-ZGcd*a=
zbu9SLL#)rHnc<d8C3cF$eh%SHUW1#4*JYKo4O}z_pq8PIN+k0#*x6TN0QEsI^}&6o
z)&GbL<?AIkFG=Jf44u%f)N<HUykdG}Lcq=h|I!({76+HCI-;~VnTK>iQ4(+-s{@*8
z$!F%76%oNVU;;N_;x?0Y<|Ar6Im$?%@93!`EZfSTzt50uZ?MSJ6Nkv;EEMuPpQ3v>
zru=KD<GXcOHhY;CDezI?MfWBQ;#iv|pi``@<YV)3Q{(sMH`gOe2sR+-#ew|kIqk>X
zk5$t?!=40K)!AxKl?p8^?wo6vV~-`mzQOfPgGZ94@A^BYEy^5bG=yoA!aG1Oz(l0$
z+ucnyX?!3;Nd?EGic5jM++a`!0-+kcsoH0l(6J89%e}|OVh9AMzf?^Bxa)}k#1T|)
z?!_LzHlN>A;)(hB{*5EZ1B}JzvN?D;xa2Rh?aY4Iz@DRN?oH2XOlZ`#W3*m|xRjr$
zUZf)Hrfu6tHInQ$F%%<nRk>4e@C*H#v0F}2`S;FJJediU6+YY0C(q7>d^QsYHIqta
zf$`>wQg6x2fHSrG`LEloKfXzhzK+M=aVQ1_MDm8{b8;KQsrM2Hdw-{=qr1^8U1Og3
zga6nxI^TVw4}5fmhlID(l+>UCa2jcHPvm-OiY?d+04G<?e%%z>^sZfdI5>~YLSm8F
zT1qGaU1B6<yZKSKt*hrA3{7MZNF|s1IkF~9zRHu?FoqAyJP2_8t?Xqkhoi|7bsrG6
zt{=s6G@*O@fp$t`AVx1H<>pBX-N(N&z`0JXnAMH|CUxXwZVgQ?74YkoNzL?A#-<>0
zBJT+CVDN3i?Tv#j5UjQY4gIL#3j<rP2eq+B9Qst87?;3DLZ3`X`sCHjWc!;8cKus+
zJbixDjJnQ~_aM7*75epNF$0p^!AH5bj&qV>@yCA4^uQEsXrXyS?N_rYj1aw<(s;92
z<svuqn~tRUp7h9`%@l0lQj9$wd3(5o4f}eBpZlt*Q}B80kCWoh4#l`(Q@%WK&T~W&
z`1w@o%Q2DHHE8EIDvLjlZ?_B2ehrqmV(D}UEIxACbW8mwjK8_&UaazLd<OKkY2pOo
zWzy}c4qBeyS{vX^Q|@rIui3CVkpYePZ#(s9&Dj^DJ9aFgZXtKR3!{HWndI?dTeFC1
zxl5y#M3EuLexuc)#YJKGG4p}A;m-=V{zcsii5BGu5+iJ;&Tj;7i<c0Vf0Mg2{q;a`
zbO+bvkjR~oDh{I{rZbY>x#jA*UQM9WW%BA9eYpR(<5IhcHcTx2TxFBYn9ctWxwaoX
zxJ_CuJVbY&$sCzI7W~wr@vAe0oMdxt<xMm{Up*5LguF^r1ND6rF*?+L7I$A}y|q>8
znsqV5dKa?-h<(cKTW>{bcvJRK!3jbtKPL-i!+iAgucPNS)d(YB-Sr)R=DQAgisRsK
zxBJUBb@nOWIeKFkos@ha*RB_7@H7yIZSHlVT%rg~&zwnO0I2XjuMG6rApA0xP}lU<
zE&1e$x~K1-;|6&5974C%`Lz2B0hOWg%YPbf9r)96fFuT72sn{$5&U|3)c}V*kS<i=
zyaSIRTOt|2uC*Us9gQiq`H5t8_Jkt4a$y3Ahd+?X1kaQl6!n#m_+p3mn`9qvw##5B
zK0oKOjfMS=PdQ1c8Imj)Klg4{r3@Z5+tj^G3h=p_o5DJ&By!<qD=st=NwdQsga5wx
z+<=t^W`5u94bmGSt)EQ9L#t16|IS=~01ZL~Wj<{EG7v8-yiDD!IJhBZ@Lb7XmTYU-
z$A<L$EBbND%<2$+@QmkbmW=O0yj|~W!1oIs_|>EIryX2ZX-#IxlU}!PH*3pbY3c<-
zX6l5ug6PFdz6>)jl)KKMRc+JRdOvCM9eS9w2oH{d=bOT6uXme2%6#)aS~$K2Z&L=#
zvG7nYAU-&SP3|(6cUrEHt{=htNt7xp=pOyO7C(zfMnMu4k%ts{7cM07&p%zj?k3Rf
zR&(v)8i|>>@Y`khpu~64M;k5}LDtp+x4MYHn_s1@OReFmwiqO#tkXSz(Og%r!o{zX
zY6k>X;d>~@2b$NIuBc>%{-MJAavgd;rln*e6))}antIUi@iTW3Y{1WB<mkRhr@5U<
zCAK{qxSA<)@x{J(@=w)v-R1hl39#k)K(<L8f*P?fxf=!}%@8JBDaK7_d??G|CPYXv
zAs$a8VoMyutRf4DW-lJ<61`6&cNLA{Iixh?O+2;|EqxINy@((dQfr&k5O_&n0B_Zx
z>1a@Q^er-&IBa%%k>dbYwJx4Je2Fc^nn+Z_uEWypoKAzcst9D~KYWyT9XHFswCUn%
zVV}g!tr_Mb#5l3ZOcA%Vd=!M+7keN|6myvwA{tG9<L9OY^b^Yr!0m!me0FDCE|Gm-
z!Yd+YC~6}E=<g*@{2A3o&3qj9xeE&cr`?9(REK5<iCNQC5=nIpZu1+cUA(}9$ah1m
z21yg*D&_FI^hV{2QKw(1B7N60{b{_OR5C5clQLh-*F9)*jynny!OK((j!U5DM&QO4
z;Y@h!)119m7dn;aV%K#;{Q`)T-?3bkH$E-Z-fo0>h>`}@xI1f;&%S**vg9BxfU)#V
z54XQe=w!KPTm(ozD)yX(i5;=byh52r*KNX6E@qq(8z%@G_`%btbg7~=sda6O=(&-J
zydut;1mdH_vsZ+owNHCWeSl+vf1pTwnvd8RDJow;mV>lnH-JcDYFX8e)71=f-v+WR
zlN+M`h3y{+;<ps5X}q~HyuLiP8|f{G<J&=M6ICk7F_L|+$=_J9hl<;l|JU%+#r+0F
z?waiY5PPPtzjwkTdmjybVHtaWGBy0hm4Z~p1hi?7`Bx?tTVDz2$MseSk}cmI*-o3+
zB>eLF%+Th*BcZpMyg(#SeHAY2Ofy7!#6Gl%^b>SX|AWwNU;c5VFs}Yr%G9{hVpE-y
z{pbDL0%qq$gjQa6la+<vvLaZvjLiVIs@$JtKT5zJ0VRvX`azC0z*$93&4M(5yjL>d
zWtVk$6jfs~+GwINA_m>fy?tX?(Pnre<1?qL5u<>A@vIU7;OA&;!Ij9gh*NLv1;42^
zS;I&%#FK|ze#PA0BOz(0UyhnES3K@HL&|Vdiu5dO-E!s(N{#(z#~NI21=sj5l?9$d
z5F@`S-}2RN=ElV)f{hxTR8j<1YtM>gs<@Wtgc=rJ8bs2zb{fK=PRY2A!D`B1<xMwx
z%WHc~JsUj)f&YT2J?KnPM#h`ZLSR|)ML-`CjC6HMqz>$7y6rvrz<upF6Tm#=4%2=C
z>G)RVHa}dLpI&QOl#ncgCPgvi?8a*_Lt7n`1@r4LYwG&R1Sl2Zj^Tso-vePOD?zjj
zk5XTCfj>e_0XjwU<})e{ayaNE?*5e9BYfgDTgX2247srP&jZk(QRWHpH;l;$+##|0
za<rO4H6L)Py43ja&GR~&t>eXfc|stXpqz?<`d;M)Kc`O~mLD|_&Pj|R*A~I-KPv+I
zy?0-}dB$xg1PexV*pWCrrD`n?DH*{}wBc*VtC$EJm2WEb9&H+n)&R}(OrCEfT^zkH
zgY7yro!0gV6ymf>jn60Bhol0Oix*^z{7LxYbCzP3_u=0->!}whyxFL2N8^40h0e2V
zCvh0&L&VW4_mah985IV1uaJHM^F<Ys5@Q+pO^R{y@)wyhLg1T3^{|a1LIXfWv}9#C
zUbi6VD58062GO2>11``;P{s#^XQJ!kHChLEXC48jhHI@l0wvNcchNu_W5q=nO$fCn
z$h%>n=>gFP!h*SQJ$JddM4UW98KAuOEsLtMQ^Fnoo$*JQ?J6#>`Ah%6%Hino!Aa;(
zl-I>W2G=ABfrJhQDTs03`CFk;Y1y$X-1y*)9s?2$j@&oZB)m@#JlD3vJ)vCFP_6?o
zE_E1|(;MsZIC!@1?p~u?hZ>F_uv*d_A_i)qI7sWaX7~G~<w0!^>JSitnn1Bwf&X$G
z)uH_wKguK?;R#4M$j-?_z#nBHzMvrLQ3J!^d(t;RIS6rJ^i;lu0h!#|47R13ZF$b{
zHDFI&Vs7HNd<%s-ca^s4j~A5o-vCfFr8js22)tVQYdj-vJ#6onFKjID+gu|(z^SqM
z-ray6Fj^=$g*|IpAoHaJj3(fH`FP08`6ca*%;_T93KIFD#vy|L+3TK`yK(9_jep8r
zfTbV|0BYmnls%eSZtRoME?Yp-12!k&8%I$xqP(D+IO%2;$$K>s@37I`;R2<Rzq>x*
zdelU#33l)!x%GjHj@Aq@rB-<Tivvg<(nb-a0GfB(emLkV+WY}Xv$_IUhAEC~YUb^)
z{d(6{g9}r=X<$Zjhfn|s=)`K-q`h{d`~0j>S)Hp0@<sBEb3|c4!!SaXPwhG8(Zm4Y
zTan`s7>=I3{@d}f`o_K%Q$0iz$hl8clbBu-i^RY-TL-MZ3?X-TKKBWJ=lfzSDC}Bs
z2ev;0^8P=L&N`~e|8L_PqZ{e&MpC*v1q77tl5S}>I;FcyLK>8o?Tbh^NC?u6(vo9)
z_Iu9G{@c#E&-UKuzIT7F*YyGk)FB)CrI~EedvJAL$m^;NfEl{+WU#E>#JVQ<q`j0%
zRxoV7<7UilK!Fs7DBvQqR>l2n0^+d-yf)piRbRF`{eY6~GT)8M9O2odKJ<6~rl>3v
zLc8GI67?>+%rn2_Gpr0BQx7ep!MGRS5?@u2zI(;|4?Dh=;n@$SjG_;4gOnOE@RXeK
z51)jWHlHL;ACgY3Gwa<zV_0_&2;t`d8;l@KIAK2^ch%l_w`hh7;DPxS1GDJ-2KSyk
zXy|twTY7CU<unGZbF$v_)&uiJ3a;Q7Qyt!<524ji3-1KVwF6)P%tRjE*wDKBpP*f5
z5_#mM(n>xu0$U=jA)(n3MG~Ug+ovK~vRrfWIO{;G+0gtvv0Y$VVT{a6U#piji#SXI
zk(99T(g3`A8BC@R@X#1IeEh;k5&bsXk18O_coiaXgGxCQ%G5krb0Vd)>tY@a<z5<+
z0>Z|yCr#m|PGp?<Z{EKR2aG@uQ7s{ca@4*!qy|}?BorI!O^2+W5enpIBCL4a+#zL{
z^!b~Sd9at`WqtrkY+yM&hsazH)D<~$Q*IojHScnYmi9UyLGXEIlQTpPBexDoBGjU(
z==rhna514Q75qVW@NaN3i;d;?jYf7(`Lv-YO9$NFz-gA}M7(#x6ZgBBf&9;xWZlxQ
zAeZ`aig!KVw~zj2k_v*>v8=p7hZ0`~LxLOsK52N4P(=)PKg~emU%jiCOw)jHNcs%5
zaLG-FcdOrB%02(h=D(x5`~Ld16lh6Hx6WhBuX$kDH{_tfDqgh0Dkq}r-W*Lq7f?uN
z<{-O^g=-^}rLOz*4qi~pO=N*@qUGz%V`z!w^De_Y;{GI_^&yVB=2cp-w1)XHlt2cf
zR0N*ygz=N0p}lmMKeV|E^-ly2MqjVG`PJV-BVAjR5|=8u{(BcmkCGda1*vo!fb?s{
zLkCL_N#J0h3|cOA1bbtM>AnAdeHDc2^T_P81&P!AC5*i<;IxsyV%oY#`1|M7<lcq4
z?!)cwS!Z_xc`2WsZt;UKWU9|yU#g^Di(Z`<2Yw)9JdBWf2qs*&Yu)N^@yY#nBBLI`
zBkq=c3?9D+M_HN#)F*z}@2y8uwzz)+1P$&P^9A1S35RppD(tY6&^@PhgByHDznhVc
z^dM3<?p&^*02y?v2h5=W646f_YtFL!I>#-b2AzRs!P`<Bf$8qW#kC>E8OTsIQj)dX
zv{#$tkjm=f%e0U;W@B1**&!-UT}n~+1_nA&Dw5NvfWE%8DxF;x#A@buMin-#rN@_L
z1bL;gA!%6nhlzeBMyfZ~(D8us#vt+jND!L-WHT&ZgjL++9Zdv9<OqtWk(2$1DK56h
z)SurrD<)Eb0niZ+{hNt(y_YX)^HEcM0>|;bTMN26Py^_m*y(<>Q^OAsF_L+-2BbZU
zqz;SSNrTXiBE}MnF&G5z`*>FmA)gr^?DdNY8Mz^i>9J|li0;kQu?L}psx;G&eCY2L
zKOiZ1O=^0VMCb)seagk-B#*y=WcDXst|advN`>t1NzSEF<vjGrGQmj6zi9*n)Z?b=
zGdQhHH}aJyd{4?!!nZ$HDt_1({IQ+|(*}+HIcF*AFUlTFpx%g=IS>A<`(AnPkU8FA
z(E(ZL?%KN*EBo|0+7sz<&G9L+OYK|B0!Xd~exp6pH<_hvg-&Pe{xb13@Iux#QXQIA
z)~6#k^rh4jB@#>xQr%9Kl^>b^&z_?UR^;p|+#;W6_=MV`;ocIPJ5MdXoC%U>P|Kp5
z4Zph>q06VV`{qhkq%a+3aWP1_Od*U?t%UrpxL9`F@`fGc=J#xs7^tUSTa}I)n*c?a
z@Rz9^i+wPEp?pQu=_4>plouRVf>PcF9a>B26nUIQ2_xO2aK2q~M6>xQv?_G|baCNa
zIQHZmCNEoqjnqT_N{0Y~%9ZVFP=XpLWEsX|Xj}uRC^pwT8e;sl5{rpCd5T}qdaKvu
zI@=-s9+@cJJ-~jED5028ARMzFa3=VZ`2M_09@N7sB918oLIB}td=sO-(!3eoY#rA)
zuQ3f#kbztn>Ki7YHm7b+6wl1Q$N;{_Uk65DL|`%a-2KfqJ`!MaBD~V*U=l&;fEZzJ
zxlp&T@`HGmbX$e^RD{$8Hmcm|<G^Wm3LVj2n9)ZnUl=bMkO|Eb)KUO7aYEG@HExxK
zp3Ft_ESyK(g-#-=@q|COm4u!OrfQaW%cG-^9L#krryfme`xKfrO};{j#Sg{U-5C97
zR=_k9&$(zZ3U!D14F5(<8_XbFIBtJ6$bgpyiZ`u_uap|BX>>3e3V?&|u`cuhT<N@=
zN`!-=_|mU1QXze)E-5JLI-=1c$K5>r`sf`jWH@6SiJw*0V$%CwFba*RdT3{;YylJh
z7&<e*DuM$w$RvJR_6(Hhw6B8#TXFkQxR4B1t_m0}A{O+WqXQEEhiifWW{rY?th63P
zbAKq`R;r)3lp~ohinwO-Z^|)KY-pq8QT=y-?*EzRqsKOVm^Y4#lg(a8-82iF>Fke;
zA^qOTpZLyBTG{it*CV7J9yn2@k;R?GoEj`=c7qpP>EJ448H_a7jpqBI^7%myjo^gX
zM~skTfchVky)wM_hluC0*T;Uvf~ReR(>Xp)@JHcunJUs1^n;ZOG5dZOKr+m7i(!s;
zZve~Ju~b^g!{f!Z8q^KZ^JDPM(DLV9V&C#_yxB77o^9ONhOn3O@?%YgEt1)%RPQ1~
zQ%K@bI3~Lby=bNILnsq{(w&BjB5Ktx8YnU)nt(>~`o8vUNb%<kxIWW5RAAD|@;h{Y
z?kZsZcLav5c*R}5<3s%=-?jJ?mWYssf!YP_2DNA2FH0mI5{6F3-RPww93K-3s9uWL
z3q?zeAbT?5>zw}Lv1Xw3VvQrNZ>*}NN)etN2CLPTYuvCH!k%mLbHf>6=h^Ry-QU^v
z!yG*XR#+Xl>+D5%C^7uG;D=WuqQpmdtJQrvum9ac$49|R(=-2$hSXB*4<es*GSiCS
zjqR>7Q#nOfj<3{Gk;(3UJV$|bcJQKG`t)>#`K{|fwecCAx55<bybtLCVm%vg;XX|>
zL~LPSPV|`6YwX97O_}iJ9-)RW3<uxLaC>L8eX<dS^Wfws=X+{mnWq=(MF@&}k?DcW
z;M?DyFBDo=G=Kd@y(2Q%^=Iad9J<05z5-v{B2_E;dTvcRb0eS7RT_pknh%B!{)ooK
z7aBm<lf6NA2k8~a7RFZFHbn!X2c5NfHF2P&XT_v^mQ#OY{R*Qo1K33^4N+pmy<<_^
zzHO%#+3aEuS{(U9DuO3CVO=T$C@B9|Tyb6lb|?C6tmF55tp+Y>4T^X52w@~93N`|d
zMwvH?;FAZ%7SK)<ug*=VG<IoY&S<L-UObq)KU>-%_+H$ksqnlur+6K4G6*(*Ye3h+
zIG(>c%g%5^MFVlUeLXGo=WlQaH7<n|z{ED~t@8dMVb_=KzQ)MLA5;3=AdS}me=GiP
zxG}M^rL~@Dc<hgyk~Tg|P}T(%E~ZDFOL4DyxLy*lfcW%nR-M2ePce~W?JL|26Xr+L
z-@8hlcx_A$Ndo3CFE>|*I57D0f|>*>@_j#hVtS{7mN;Ah_75!I1+Rhc3aW39%AA|{
z>+90xpc=GN9&qzY^A4uFrF!gqiZq`>oBvQOQ8B=qpU{0!c!Q1oZ?hD@3DD&2*6o@!
zTg%P>qDa{8&_cdoB!t7LuRe<Q@{|JU4_A{_JEx0)36lVHyOG}@zs~@}{>J8m_!43*
zcbY+}J8#Sc7Tx=f!ob)0rHQxI?QyC}C&UVrV=y7LX|t{I1ZNT<e?{_gO`zEpW*p+v
zYvLS#PSM<gV_eqKFthBw!9Q7)hEj1-N5!M6I}XeWPO=~$M&IC$PY@1>BsHgmjLa9F
zGg^m}a2~t-42+I-C=(OXd2b7>=hVe%7w)zgiPKZ$!UbulgyqU74q;N|ry4)V^X_tg
z#3)z?lmo=3{^<b}v2~tyElD^0gtn&lU#Kr~%R3?j{P0c?h}cZD_W*n0RG&e>`**|k
z!$wkmgDI7Aear{!dNMHq;SzU0rtxi*>B=QBW`oaaIRpsQn&L9>wx+ou)8k{3T@B%C
zfO;-pHmW~%B~eIi_x1F8&L6*VKx*Z2w|>LOnqukZr?aTf>Gc;Xnh|_bh2>Yy0|yF6
zUcPkA6Gm~s2l<(*F0m^9n>UjKc2UlgG#3w&%xztYHp6{k!}TAMcV32GiLknGVx!j>
zX^FD(<Gi4KK9D`gaV+9f9{z8d=9yFp+`%W-7B5FF6lCBuMj-B;3!s|7rp&U%Lvsgu
zf?P$b4F|u`fTb&GOj*+}KSX_xDdPVqX62(Lg2N!<sm!GMEGcU9I`ya|cOzV6=fW#P
z5Z?z=pt(vOV^Y|+y)u&hBNm%Fsi{a~7Cy%NLJuqo<YCOI!dbI_yCo-Ps_}74^6&A2
z(e`h%QR!}($#!g{_<f)gp6Xj?Uq)Y~tM9P$QT8Np`o|!0`-#0J)AX`;2=V)KP0a|E
zopCBEC8;F3F`tzGQi>mN@|7m{Z#gu+XUs`AVJaLGg^RZ1a&%WYf-EZKFW4DbFsL(v
z25TPFH8@mD?M($o(HhWAh}#u{-4vYyhXz^zBv1N7J;yUqe4g5!C*~&;{TZZ5L^~=H
zNt#If&{DEjh6S>=?=+V~U2pV~F3d(rGU#zyzT#tdU(+b+wjF8#S8gZUgVf-B-*P0>
zz-ITR4M7)bsfpjIs$$}JK_BttUtWw4kxu0I7qG!5GdE-5lN#GnFLpc*F*B3$EB^cZ
zL!-vQRsBrf$)Fn3U8nJ-f=qEnFMSg7V{*%vu!te=#aK<3=o$|LzV1P99-i%Lfi{fA
zxsuG(9tF?hIF*cB@(OnadZo$gY?l<Q4D=``gTl%Sq4Y(`hsxvSvq_)$_4>SPUJe|E
zXP-n10$u~x83n6S5fUL=T4EpH8Hovt0G2i>AxUW#zAX8PFvDNzV<IPVv`2^Qcv+E!
z<h`yMDG~P>?>L6nwN^8zAk5FN3)ZW<bBu=s_vSfpwn);KBsW(d<(k5%@k-6!^$fpc
z1no#+8s5mAuz2X=+F8daU4gasor|rVKE=PlT<aQ|jM1C7q#K{9)<MiIdhv@fvA>bD
z^`@&^Mj6FP7*rc4)yoR~*$GiQ|CYyzi%r=can2Qhv(|T3yB-|@krU-Fri#?!$hK=h
z<w6DhWXXMwXI^{BY9&wB@kig;?9+7qPz+^huFU-`?<c#G|MC1@W1b;w<HJz6i}G#5
zlEP+VAm0&X!b(1NH+F<YoDh-a6ZdBY!w+wMZ{_Rtp7nE6AC&x*gx6}8#$S<tR?RJ-
za?X11(Bt^BARdi&7@Jo|#XY5|06ydWTqw!-qEt)Vjq4bKIVb1gtz2}$BK})8o##9C
z2&7x>rs3fLnna5^DJ0YZETm;siyW*Qb+An`P2FI-eWLp_1<KeMx~s8sLf_FS)Ug8(
zxH8Sxevn8%ZHrI)$!_~%OoSr9A#);8#m0`QX-szPHf8m%0d|9SLJ2@r<wiJ!rrz6p
zDu{oHRm{zfKG5aIow)I$;42O%NsrebV`|9KnH;HhV_j6g=hAgd_3-~P<W0NhPp{*k
zE}?PYBzT@WDjXqs?Eg}oPFhf{7-8~FmUO#5QQHA2+Wv1CaX_ta+Hd)2BAsr(k&d9j
zSTYq*p8v8l5<e)@^|ioNo8>q}@DM=s!lv(R42p^P++Ud5y47n-sG?y@WglNeU1*U@
zoJf6jyD3Thx<IaWLTDi)iS}J?nUA8qsKP;zdKBHQ!|!|Ryvf9qmL4JIJku<RI$O+I
zwe63`*}O;=+on&$I?a`%hhZFzolN6w%dQPe^b(VwQ-!3&DPx(zf9n0u+<xeb+j+iM
zP7o4C=?i4?zr^szQ#}Q9f;_VB3!y0r4$Y8u{ge`J@3UQ}v(t;&H`w7MU!nMW6!zhU
zY%}VGm1!!T9pmDaFSO&v#}wE$=!lw}X!j%6KQctsC~Hr5>UB2PMU4m|%;&p@F&fZ*
zDCIf&Cg<givzsB}y>PSyDY)_MR?{Ni>v0_m{!F4lqVp703~f?C^CYZ!bcr^%j`T)9
zzkuZPmOXZIb8)x66y6;r4mF<&+@E}b@ne%skLC*aW7gtA#$a1?;rFg*EW8)kCAII?
z@U~!E-JIWD+Te|P0Q+C_DA>%(X{bMWlD^qIHNqhB;Od1))de$GUJEwUpn28tOoDF@
zSDKPZ-M#&!5n&OuSU=bIZD<u2*(<TBV?Vp9kbpQJj`NC$*zv?>Q=Ak>a^;>@{7{y+
zhpq%ebVrd1!I5PdVR-$VyDY_V@89yL8`Zrn`JkdX8U91RnGjl8y^d=`oT|D*N4`RH
z@7u^R(^%@nF2XElJhv>9B^6o`^IcSk^+&I8nO6Z^g_OFGYz+Q3^dHN){QVK-C5EGh
z=-gD8a;>?(?@e5<rB(pM+SXsKaR`5JFJsvFM`!)f3WsA9Gv;LCd0D81AZND`?qoPV
zWA216^mRFZ)CwufSJ8&WU)z7xH7#`%6bPs@XpIw==o;0|0ydjcZl06QxjQ6!&DK&Q
zH_4B}4a|y*C$-r#@{?IUU5!U!Xen{s>PWjX1^?Cnj<SfZo#dw`y?J#JU1f(<-aIn<
z)A-f?8XfcMy73G6mir$PJrntMTi{|uWmHCpK=rkN$J!$M_58oc$<RdVcdwqcGPjk=
zC&ksc;*}qI#`@7Fy=O4z#pQ{eSNb_cF@rU)@xwk_a(auRZbgD3wNB@62em_4Rdy#=
z*!_)u$&8n=&;M(2aBP=Jn6H7iC>!QZd~j}mZ=U~mU_XmsG7gcEQQi~{bcC^F6^@oa
zN0j=SO)FS6)Y+`nt?M<4^OKrH3UnQq7U`kJb**Z1sfx<Mz;wrrMbXC--YRPtuMx{<
zD;?e5zg6oy?K_)GHm)jzrR3#WBuTu9u~So|Oqqyp?wwuK+w<~$l1L`*%?~*?+C@U-
zj1x#zNoFsWJ651+zJ<6TVFa)$mH-r_;1~vrBv;hgT6VHBQHBWPaTy=54}Mmee9M1D
z`E-^MkB&(LYHLp-c&l9u3jW#GNiV$KIf7}Sv~1EW)axWYWvoy6^O>_qHKz_?R^@@S
zsWqFIG;kBiM=sL~xYD{d;z1*Q_gI*8J`szt^iUgb^VvqU+Q|GbT2x92>-Y}_b04X;
zE~f#(ntn==Tk%GR>d6_N3z!{pWx{DFc5G~qKz7^)SFvLfoDpgqE#KAtzD!x@m%{v^
zQDJG?zl!%RYz=|PwcV6Lv~M}+0p7lnRL&ZUpMY90iiT3~&O#>HySXl^wvk|0dM4@%
zM#`N&jCLTq^KYLvZTbtd{y*Y;8Y$>AQ-IWRsB%Pgdn9p2lzT{x6I0HQJXiA1Iz3xG
z!$ryw_)Wp8zh68w>kyCkP8yDe^U8{;xpclHr|xWJX?yq9z6JwW<G(cM=&3QZ$^EE8
zklwpUJ2f!!Lf_G@mpnRYz*HooGdMj_<1<F=s7rF)Six9Wef{*tW?Nofp_Ou*fYUd>
z%~s_t(dZb9LV3K{U?-5M`mqh?&*D!JS_(GJ{$)k-Y?8YOX^JKEGR$?7fH041em=4j
z5+(i>96Gz3kw(Y7JUoJbcf~X6zGSRUyeeaMh2tE=GK>FC`AvwO)tarZf=S<OSnQ1q
z8o4D4J$G^ZX3MGZz+3fo37(~IvoqSf;3Rnz&3j7)$wYJL-~3@~g^BA>B<x<qfb&?b
z;x!USaTxjUk?!1Av68hdyXG!#h(A}R%TxgeYrRMI)z6MCZgL_n*5{YgL$qkK7+8Dz
zJAG;cPo>F#QlDa-%_A>E$TpshX2@edU~RR3`|c|~!E>?k-km_s5mXREVmGXjfpK9^
zuRfu4qLUM(Ti<K*4Ua{^41Mwk#JUdfRnofYy!Mv?qe6D_?Kqh~wk||OtR{w~A$Ks5
zO=wN1s3<|)PSN3rx8$>;E~1PY8LAR(q{JaglZXxGfu9$^%dR&Ak`&k&`U|rmx{Yhl
zNT7aRZ|M`BP?p2Hx0~Ydb_xO$HY(g;1HakM*zhVS=WqU=qq0>6H#CiIgorwL{J1cV
z`HkuplVbPSX-`&JrrfZ>>{;Tu+kgK-K$hk*HDW_mhN<f*t0Yy0&P85F=NsaJ&wq{R
zm`30#C`7oOzcz-&&~4`q{$9!$w2yZgERvuc{NEh-O>C0F(YYUu9~M<Uy!M4Kn_NWf
zPQL2U%sU+LV<v`L1-5O^ZKikmPW{xCpu#}<XBVRfQruWi4re?KQo<m2V~3xw+73Vp
zMbX}I9THOge~+>Sn*q(tCTea0pN(nR(YWe^XeiRTcsTm){-bVX%jsizA|KKAdjj>)
zUS%&!qoJG;98F)DXpF+TiZPESkIv_qN|LCj!}|z?#&Tn6(dsI^%Ak}QnXfCf3sCAD
zX*%MN|F|s7y|q3yCJ-tpg3CJI=u$(~b|MdyvVk_qVw{?F`r+1dQ?u+ll{H~2R+##c
zFOzEvCQSZt!|!OFi0*b{s4Cwk#rdzH8@QcvJ=aXNZWL+}H;^X{2nfU~XSf8Iqt(LF
zG+({{hw$*wZX3#Y3;><X2zei&HYKGgu|8*e&k0*-h0DK1l=!MS#oLJ!TOG6~g9_@+
zN0E{b!OZ_8NY@NZeo=zbYvAhE_-}a_8g9QRWY^kXwLHF8y~szoDfiC}SJbu+>sz9F
z!5I$!xg*moF$WFoxQnv2>d!2;B-kQ~PXwn3FYB^ni68)5lIi^2l>@d<c=j+HV0SjQ
zzK|G(1~$MWPuc&y>n}zly9|@CB>vhAhBY)Z7(W*LD_gO!w39Z~YzY0)t4SWff18kW
z1KXltT~IDJsBfU#4XL0s_ic-<ON6vsEZ)4t*CJaA7@4NXEug`B4Q7-mH)D&C&iF4#
zAy(26J?juA$tcEY<j8dwXv>s!gA4jFh|_1Osp)3QpU$x={49%dyRb_V$GN@t07Q&V
zO$(!GX``<w1-zF2Cpr2A$r_RYNaZo<czCnX0c8hB(GV98_Jk`s7|P0<&q+gZg`kYX
zexDAE>Q{iJ95;(r^!;+X*7)!Hsgcd^oX0vwYq?mFX<Bgc@H)*AyQAm_$dXeTNVCJ}
z?8M!;aAxvJ6J17E3-09yI~KZ=r;HR*e5LA|16TxH>bMiLeMrFoD8{>VgC*$;J=rHU
ze#F1}ZnKRH)&zUN>cx~WDx-#@{B#%o(RG7t6BEtGKH?t9G6Lg2d^^TyGOc+o85C3g
zPY3omv{yT|o$5S5i^1fN38PU=S+v}Ok-<p5j4{`%`Y5Q3yhs2=r~+Y!#jPPM0J>zs
z=#brmbWfl;tj#fhM$$G!?*((=yqL`Ki@#zy2sv+Dl!RgErgbN#&}apdVdd4Nn5kWa
zK96r%qU_Q9<DX+pe~7G2i7{r?4zTdwsbOeW0G+#aw=b8X!bu-AS6*s+fB0}1O1~j4
zlb+5?O#8mpb*-Oawh9SYImmv(+lIP=bx#9Et<Un&=&lDA3?*N={V;tAFT3>o?om_|
zH2tKjw?9=cNHn=lcWpQPddq}-D<R*ZIlDwF7Q1o|TD3qH3InzR{}@Cp?oSS}x5DQl
z&#aR%SNyw~&-K`M*Y^1wIpsn7P++>`_Ai;r8`Lz%pEG)Q-ZBWuGzLw*>vi$z2uwlN
z7SCF>8X0IffqEB<J#6|^kaDWO=pa(G6UxV7<3Zo5cEQ_<MR5P>l)NO6bLbo=`k%E&
z52gDaGFCpC#t8@ex22?_fKU{{qKZAS7Z&GO>K7*;3d<cwLN<G6x_<$jnF{S64tHN5
z*U-4d`nj(%#1Fr#|F6Z2kl82A>7_t6=S`--^zs^-SS0H;%+G4IN0IXjFL|(PW$C(?
z8CUxC)-r_ta@RL<JLRmyki7P})~VrfC|j1J<AtJ(hr`h&Pg=m1{&Ck^YD`zo>YV3~
zF)zu>JM{x5RUlFo6mXi5bF@P>C@5XjCR(am$$Y+o)h4?5>=#&!-rs4g^2h*p=s`;w
zCROkj`+`D<oBVD9m+4)uJW!151mig##?gvFC)6W(QB}&1^p!D)(3NCILf|48t(eI=
zOJ0KpRIQ?eT6%+NChb?2J2!zettgUXmHZIh<s_Ub09L98w%#An{23|HcJWh!d9mC3
zU#kP4vreXPP!MHbMtKCLfouQ#s71jn`bvhEAQRrPv^nU=y4qOixaLH?#l_hrxP&Jo
zal|%p6etcRy?^u5I+`6+UZ}$^!ggw4DDF?^$1gWo@4&1LtYG(=0`lF10q(%Q8z3fH
z6~V9F6peqN6@y1C)SwVHCK4pQm->?SUFWYF%ByvcC}m4|M3XxR&;7SdH~KR1JIl5O
z$5x-RVSw5EW`L*d$Ql$I=>=%~M-M!ONuz`T$^z(P5TTwA8hsLzIIm7Gf%l~M4wUWH
z#j`>!Ai`>Kp(+4902M@s1pfGWv{zB|z6~aFTyvK`{{!a&gzBt%TPeQ}fk>K0g?NKP
z1Vh=jt1LA#uJZ4?(Pw@|(zY9pY{Sh_vHhy1e#uRFZHkzPKbf;3VW16wmzMb2LGirk
znEhpjb(*(%5Md(-3`Anc(IQK_b;e>SPUQ{Q_%6YE$3zk(wps^VgXgUQOMxYDYeU2#
z9k$?<2%>`O0zz^`-HOwXfj}ErY0Bg#ir2de`|>u8rDg&1m8CT*R<3Tdd=$ukF2O1M
z(GRDGs=Qd@D<+Bq0J?=RYp5Y+yc?A*W1xDc9#`jkhjlz*q+yeDo3+O}M^E7xh-x@z
z?>Diz_ZHbRzALwIaTHKOwAHtPTQKp}ngEJ1ZI&L=UkE3?z`RFA;$IIHq<eB=Ur$2_
z{y&P2_Qx%Cc0+!Invy{Jh+iEvQ!noo$;kfDm8iDx2O|m$8*|j6Ln6`S8W2kkL$-#~
zCrd-5{eTas#6KAIw88-w7RL*+e=1YptH3iF0G+GNZ|=pkaclLhZRm%nPje%6!D&Vj
z{{+9NroWrldM_oHjtH;X{A(D!py}SIu_bX2b}s#sPJ54eZ_t~!I_DaH$)x)wr=TS7
ziu#^32+#c}Cn!-T$Yu8Bk><u9d<pac)W;7-v(^fXx=jR%(D@qd_CD=xaJK^cYyYjj
zdHl`4THxRCmd&8gXCSQ#DPZ$W1^QAV^Oc1XzPC#|<NC7~imZ4_fltL1{?m*BPd_4n
zVSo=PQb%S<mix|H5!d;p4edPu4vrLu^}cc;6nY=!yu(UkiC0#<d-D;FG}{LcwOTqe
zn)DrFeADSDU3>fJ7IP`I3E1;jxE(_V!s2*r2H$_<Ch&Js!0OjH!$Y}d0F60kL-eD;
zx4m^|v1A{n!_FP`%8VaJp2r9&-Gu|#!q5?H$cd{>hXsEQ-3jW4K&`*fY~<@UtGG4l
zhW-&4=0z%?wF*%DLiw*Cm`hGyYjdIPLI5?cmX-252cq{7_P<Xl!X?M|CRYQn@7e)&
ztv_viaa;V#KST!ZA>TiA^=MGYnHSCBv}%JMDNL4We>TEBcDqk9F#`I!iNY{Y1`*Tt
zQZ?{L(hwcoCgm0Utxq>tIR*HM-JbNrgk_P)cb9$&C2r$gkj%B()03OKme24zO>p#m
z5kCC|Hqv-_bov&GB?qMasU3?o6z)+q$t^V<5{9t>I*)9jiK;_ejr4ZoKSy0A;ifA?
z8*dOX02@+8gQa!zn{RS+;ZJYAWlPyM_i?;iHXE|czo!pU2L+*2FP)a}lJ$-UzEZ&>
z&=F{IpP$a<dTdsx=0X(!UweNzqqr-E`--#Lt4$le+}v$K=nn10p$4ds1Tg9I^<Jdn
zpEYO$2nI4jN(?pwb?K8nCA01Ff5_i`!0l&!{>}3Y448FtB}7*sjAVp`_Lez+n$rHU
z8NWyvnrRP54+9MRvee=M*YD9yd45^_y$|@qi?V$fD-q=|nuD1N(g;P$HY2-~P^)M`
z%p^59Xxb@})FvMOWP%IML5glZwRUFNUm^d8tDy{MOFmRuzqvC8&jWimH$^cpr1XVe
z2U;fy$&S(YQ=W4+49M4<(<cG{wrhm~!blN$`<rHr1ib-B%imkl7a4GQ)Mm-}U4T4t
z`O`(u%1p2T7VEJ%W1a59dHe1HyEQ&^N&Tc{c<lt!zX!P)_HcP)yr3JPI44(d7oChW
z`XqlB|FK4Oi(wn7viVo!lNW)+zoG^_{8ufK?0{mDZ}K3<$TXPv6eAbVjN8rdW~T1j
z*#i-*CI_sA|HXnf!Mqm6+=}InW}P6nc_U_&WBZzhFyZO2(X0i;NDV!EOYY57uYB4m
zu}jdnrXz=finPws=Ddg+j(s(E1A-Bu++l*$v$xhyNg}s(jhYwIN`OWy507d34<=Ga
z&kCm<a+wjQau~>Y=g>8rR=hBf+?$Bz9Vp6}w^HGJbe;H6^qc|jGSlWce5u=gKLc~#
z!fQ2DmDhLr`g^vw@6R)b{7BxVwK-;p4sB@yupK^v{n!KyRydM!44<z+a&9d+!k1&X
zHIR~hgq@M+Fsu-3*1eIdfpnIzGy>mHm6@*WG7>@^{?`%o6ydriArF&3&fhh0!aLh^
zEb>G;pPPW^kwiZ)f~yf6^FG0&^J&q<BPik}YwSCYb%f5o_`~MsN^y9qHXh^O;OhuE
zz~*nVKN)bn<8byDPmIa%evAQBok(=x_aE_gk5Y0$lIMq{C{qWb$&F)wIsk$<bPpPZ
z*xL}?qiVY2BO3HCB{7zUd?Ya`An8;CV^{mr8o(p8A_a5{fOLPcW6EaV)UO$}vB00Q
z>s8wxmm$6SL!+0o<Wg!2V9Z9KO>d~~)tE;Jv2`Rp!GICq!zW{uIvM=m{a?IRE8Kyb
z-As#A+lGEd_FAltsjJ`Hjx$}6uQ@IVgY6YY2i|re1oGxKF*+d-(NC(>G!csU&uEP~
zVMO~#xrcL$>EMBWZN?oMMBfY5Ny=%hSBt1f>n;)-;}1w$VbtP|r)6}M`wcCh{0FrG
znSJ(cP+YOdeB25PCK!W|zY2=st{-u`_p6zKb(Yb1G%X8RoU;S6l5l4D(je<)^&b*c
zYT<f448w%}dP@zqXn@L(wi<dWvOSb}!44+?{aEfG(XHaU20pYbxYVIC$_Jtr;&tXX
z1aCWaGu<H<q5KlL8R=y_yU7tF@m%CA+onepl021UJf>HcAU$QIl=ibsFE1GNj!}3m
zZ^QIxJteBV#BvHrYV&b*nAH@l^kB`3r+xuefNSIAFX*6K&c?%m?ykobZ1V96I-V~H
z1ioQkfHPXwxy`;sUF`y2&Y}w(neG25dwFZ=(&Dxc;UK;U8Dh7}UvT*AOtfNus3Nr!
z9HE+(ckr^KP?%Ug2r7XTJh24`d3Bp-oL;{8Z|j>9Ac|y0Jjm@xm;h2LyJFjSmvBgr
zq=BKe;`5D`$YSC&&wM39$G*zAiSNJhQ01s0%18V4^Q<GoT!O#L-c<B9DtQxay7>Sc
z-IXN?qR@KqE^8`^Zg`OY!CU2Aop0OZ>Hd}vt6ceUL<3p5PPFDocNDVRQVHc>W}OzK
z&rb(St6opC->7FJRZsMzdA^RkgKF=O%t5jFu6>vCA8J{Tze3}Xe*HdvnZ|wQ1RiBg
zmCF~K_35n4mAv0Zbn6?A%51oUQUUH99%<oms~V5dVO+?L!G4_aO`QF^BiHATLCOa#
z^a-P0vhoL`Wpj_Gd(Me>BIb98LTf2z%U9qG@d#Z%iTaY_HT$9&(HnvBPbI*rZDlcl
zwZ&yb75KkEKr!%lUv9+m$5t3`S`G1*xLLWMSUpCbso)dQZa?c}@CBtz==#88@LDv^
zyXX_LauM0ypw$sU|D>P!`f;BuM&C>|Y%sQ$I7ZQvfEc>up@lJ!oD`Q_JJw+EsqmV4
z$j0Zk-7j0Rm_M6}W*JfU%U<&+s1IXVT!)n;i-X*qku%NVe!BPtnW?!kLuH%k2%Mfv
zCOhG;s{9t$E(!3VN!71oNLm!}^Q79CQ~v$VdPZJ9rZ$0@+um-OIehAxV8iRxe~D@}
z^2>Z1dtL@+%#APRobnz-A8GZ$wQN!S=?jSEu#<L-*ECyVU0WfK_MIK3>IQj5%e{pW
zPdek|+RS$?Q~itoP)Q0TKfOx_rKbwm@~pOAo4@lr$KC0rpOVxsrXMl1V|teP*3JT}
zdb5me<W6W*%8;>27WX(2=$=ndb6tQPVbCC73u0sR5G3b?E5lXV{U@0D<poyHQBOJT
z9ODI_*2jnbhz#a$tJfr(Wm+MW%UMk4Es1EH3`oymq{-*{XBOp<SU9>PVwsIG^f$tC
z5d3!xf%Et=_T5Kqa1sLYQ(u~>;A7Yf`ONaC!pDUx##V0D%&dNSMKIf6lW+m$2>Zt-
zYST2(^A~uGcn%mTA1kagm4~EV@S=|;e|XY-(ClA$Egk_Cy}U!l!H+W55t|E}8lK(c
z6o^R#Edo&tAAJHnS3G}R<j)Fn1_wm`%)5mC0{*Z_)iUlj43VS4OiFjs_MjXi20Y|Y
zhyi@hi<M##6XHLx@p(9h1Zp>twR4P5e`(xVdFIaL<uj2VrdeJ>ccdQO=K1omk<FFX
z_CHNjYy=&)zrraW?YZx7)hA}yH%abyeVcp>H}`Cf?!i59qX$Mn4`*0UwukzqeBI>G
zDBj;!57eUwYkn~KL)th`ko|-BqfZXP>PjyM+#rb}D|nD%C(JMR&b5GC$1My$H6yel
z^d<KcXdgtb7&R9NfA{kswddL3MEl$RUl3NB&@nGF{IDJ&B&(0NL0iVYD%MXO(;?}6
zPWF+Xc40eD$6WIGOxeozuWN#S(T(N=6CENpAEpR!-q?2*eW7AGV6{4fqPj;(rrG6D
zHgoj?x?D+)J(g*!hX>xx^I5lC)pr^ZkvJQtM`c?TpJ>wZ$|jh{ai_>1NA_vXAdZ=S
zw4VNdpGLwc*o3EedTxTrW`($oyQ8=VPWMI*k*};vCnVEuUmp<#B{zJMa7RuzhoH<w
zVqS&ZXxekdAz9VUS!LWkIm=hPB_vmA?bP0FiB8mhnsWrrpX94AbcB_9x4u8qURfKU
ze`j$^j(^{>jH<g$S+88rO3nH6>uwnicjv!a>%)1DtgdS`M?`V?v%>2|RCiOq;m5>^
zXO?Ml9HGjgD23<#g&@+#ehmFO^!=dx;5Aw?qEN`MPWC>j#rK0Pnu$$Dv(etlp-iUx
z;%PGaDqpe;9!!F?nzJ#hIOWfqd9MOcE`+@Fv1u%5CVZ(tjY{dRN=ObmYM4dca1$z%
z9R51WgU|#>u|H}XPhJYz^7rRrF5|Y*Jek~*#4$b$=pf_&v{^#=<)8SdyfL#B-71rD
zC78m*E~7&cLyGmfc^j$>b);}9p(0ulZ<}Dw$eTas(Mg8U36cFV>?9Ev-+|hf{?yPl
z-IHQiWw9+wSKT4EDGf)XEavZ;pmxcdmq+Ou0zWDJ%HvP_mO)VY79jwdkoiY59>(T1
zG&7nu29;%egMKSfdCG&(1NdDb;`}B4_%IlJ`0-Q(<7<T(j%59LA-xw?>sA^+E3Zc@
zWs?P)Mxkug7c(Nz-0_&CXSe-Y%t-(qOlFx#?1ZjuX797GqQY~*VZCMib}b^Ivf&<z
zjWR~e?>Mn8-9oC|@Kib$%OsV&(=1b^rM7o*AM+yL+LT@`v)`v~f!))jZY(eufN3jE
zPUO8PMFipVdus0^C}17y4DEkth*Edgj3RSi-Z8|@r-w)e$N<p(0`W-gS|=`ZM<th&
zXT!JTyo)R`B?FOImluD8sZ%2{P<#Vb_Y^;~GvI>+KHHZ5%-oG0O7Ee(MdF?3(O&bd
z**w_4thow|)2&Ux4|=V2&L4_@&8ziQ^a{!Bo#v6s-Phj<(cHpn1s4#i!1yk(gYefs
z0e(gT&6}1b-q%_*y}IcqkU=|ZofrkKNQRRAmRz}>xVHvTj3J+3{ZxR!*&mWO@p-Ed
z;(UxVl+qyd%QkbZtw%=j?~_MRCnX<f20{s=N8MqP4e>L$_!fsGg7mpBCawha0m48L
zfcWKcq`h1Vp!fR#`v_7PQxegFb{82+Hc;~=p$W7Ju!!6iJ(;omP%p_?iGq+Q^oJsI
z^Z+^_lt-5C`96E3xE69QQ~O3sh3x(sJqW+CRjJUIYUSUfIp_cq3X|OuYek7*neaM$
z5YH3$i~tds?9L`jI0U$+gDU?>aVPh_X#t?X6nDR_>nlzf*PPtcx{ujqeLK<Sc?HrZ
zWL8TYxWf^5b@Fj#vM11|!_slEEr10TA0_O^EcFE`o&E0)Jg3Qw`an1^2ww(^ffzQD
z50!K{44+J0B9X~QCGDYr7Kl1bhUB}_QLq#GQbPplC;@3)3V|z}jIC4drJ@a9tP}g5
zcZ{glQIgp>U-H+56P!I=I^JxeW<?JbO_sF**0qiJVC7c<*XqW-(fL4nsJJUDnn@Pt
z^lf51@WR8PFQem`2=zhZp@^8NKSGe)SGqNyB&lx;P+}J-nn2`}1(X}cBAq6}fOT2D
zIW~j0#qOPU1_ga2?0ZKKiGZdcBvBv|E-Lb!E{Bx!RtWv0;(d<$XpKSwp3wD^w<z*6
zG$Q|=C`xid^&r-$hx9A)v-L^-kD6wMzD7GJ6Ak%AwwSlUuO-TryDZBOMs~$;FvfZn
zJer$tL}@wjsw(NffGx2Ld=z`b7Pr<G$qGHaaA$kI-b1oZfAlq77pw9aL4KXs1LJq&
z^`l$OL&R{RUp_`pe_C&3$WBI)Xw(<H7icCmSGC2%Vn0&0K3&NZtYcXL^r(jlfZpoZ
z77B$H{~`Mp%R=oSzl-C^Z8G6&4E0c6nBN%^&F{kVzgn1Z&iMrS6$)zt%8-G`7LHA}
z+bTB5ufU8!V=X33Hr>i@6YEH$!x%D58<HS2RAjc~h@@@lthmlaxC1Y&MXD_|B!1%{
za(Sid%jm&={xzxYXS9CYeslyuKf0$TVdsSWg8Rcsu9FAVb4#Q%Ed2I3bu7P)8}W>G
z|3<Vj3pI=wDC{SjL?If}Mpp}s`x<Sd7PyXC3KBSu+^n2CZubjIvOZKx3+jVn%V0(T
zORu-7fe`jLkd4tdBv~-}t$M0foLlFfD&FgCPH9D%D1<RgK9X_}45%-r?U?>oGJk#j
zWdn-Fs{`qE@MVc#vXBE0)n&Mh&&WqQhe5w$Pqm#2956+ZT>0Ul;?JNU-Gep6)GRGQ
zP*hjgx@YRIYuXDslkCLT+)eRgNT(bGcQ67+d{QN-fO?EZ-0{_O3j~Ssjx_8bq)T54
zaW6UQoQ$PiGtgZILdYd+g>BTlixDb;)$q+jgnac?$^yjgt@9Tm6IK8M6r{M!>A=y4
zH91>$V!Y1?L%T?LLgp_`m@OWx{yX}!n)VrVw1?u1c=tyP6lw+uK6rd+xIzIxbSUvL
zaJ>c*>)ui7tW+_Qzxc%W1`I|mj2(PY(%i0>D=7_}jc*_172zoM1x9v66@JkiH1jgo
zXdHF@j0{YIu}O0k6ZmS@r`!<jgWm&&V?t*l+0{a~gdbnUdTi(&NoM?yr@`&7xh;5-
z-9i_kG^&w-Z}4N-iR=u>Rzp$d0M7(7%vbxpMSfqac9oE<mB{Xt0cz44Nn5S<`ym+1
z)H&(!=EBV~$KFP5${{jT3)%yQ63bHNE2l_&tJ3tk$Y#6AL7?*VGJA@7nms!%DfHyB
z6hkr~Dxu(i*<+lCiap;xOxZ2Sbop7x`KTZ^YySno>{H!6c$dn2gJ1=Z|Gl;;Xq#cp
zl@ktuFjKo+`MonJr)kBP!HBR$7dq9$Pm;vM;nPUXMbQ}>Iq0rh+}Nuk;lrn&LbQ-P
zO^2j5Ie)Q$U9ef`)B}m*W8M)&;-T!|PiOwg?`ngRwE(|7RA_4=)dm2705nw;jabpJ
z6MklqabC=BCqCMc-h2apdXi)Rers4?-x+M4DE?<UvrGVT2=gdB#pEnMrt509ATkN7
z8ABSwlx>PV<oQ!as~xYDLeOmPQB$V$+^6ik=H#iuxAMV6Q@ZKooD~OrxK|vnTtV|P
zQkygIGaQWFk5b_fL4v`qdsKLX;ZCe(TF~!|wa_%SmHLi_4@u5)zxw|U??R7JN{h0?
z=lZ}U0>9x24w?V)wUj%E5xUi78%N1>SGYcc_3j22^%H|q!F7cqT<(<(6&a-P1pp3&
zsBx#gC-0tl-=&B-T)cDG9^%onU8i4;g^Iu<0Rs8(R^8TFDA*Q8Ef`B;jUvU6ctKNJ
zfl3E8YSl7K5W2k2s|zUFD{x+3kBIBG<#*l0UaVo0$}7~u4uyb$dIjHTId7cE<`a0r
zCyOxR`T~;&b`47}i)yeMEiD7a-Uq``b}EtJ3&^=Kt(>lxOnRX|5F)xIhg#FTGHp1~
zeS3b0?>(BM&XV=ln~}yhzwN0zn=K#~BqkSN%pgM4!DC%$@Vp=HNEdsRP20zx9l<HD
zhD2B8+Umq2+a6dq5r^{u1yDx<Maj#gl-f2>=%MtNZHDI=lqv&?!cM6-KG<b(PC+m#
z<yrTT)-Vwp&4j>gamVz(7<Vm>a3}#>>-ihIvxRc|tXYVN=SIf7*VQ%eh5l>{M;GC%
z<L@`C3v;}4NTPYKRZMAr2g`3HMdP}Yi2?V!c3sP;6V!CsQh|`V(P0kx$R}*qxW^~F
zi-a-8tg=Fjs$9RTA<Y~F7!8S9>UeJ$nn*>n6$@qUg8l>N_1$Ap$}^L;(9XZ0NdN^{
z>>yd=i0-J}sb(5^#hfvBzo>vq#=e6AdQ^xXzVRl%<vVTUh*<!yx0o;FQD2<iXt;Ep
zjTDu)cIYc5WdL5J^x}YX<2MshSN#IJl`2v*BjsY?1LGE>j#Z3>IoIx{^5ZLyQn)VN
zqtx+}_*duR^@fZ9oDm;0t=mz+5ZUNogm9DkM;am_0Lf{7{I85w1<)g}w{gu`5h>`i
ztuDyhW+n}khBA9E!{`UXlny$b<nliRJ6bS9x**Go18>pvb%^A$U3P#13@gT*uBrb$
z3BWvTUO<ZV@J8VY<|`a{q?{<TQXIo@6Q(0~73mSYyebpu4-Xtmb3_Ygita)qeDM)S
z!AgBpX8$mdro8v&@~|CO5O2p_!|s>;FQ`g+XNQ3(l46+w2v^!iVTOTQLacW*6;OWJ
ze+LRdPhN1OI6YMkxFf>e*ZVeRrWP=ax==u(AqZp~4usQJqbrb<c}OX~OC&jcI{Sw*
z%z-;YR>P+&gi?f65=ip(c3NU&M(cv3iiP@HR}>pJlXmnR>x<2r<(J&UW!(}Zk3_Qq
z@CvgH5u#(@$D+9;T^@mfaDlBdSh%J<TlTsFe);QC386>EH^FDEPhz*0Sa_dE<I)%q
z>nz9;AT?=ipMUv(bX&?lW=v7%LcQ@BKG3<vKqbGr{F()rCaTHnj;hDZRKfZ$*{G=G
zZ^yWh73GtIwe_8_e4kC9)I9;uq(L6-R|NJjK5ug=N+r*4#~#BSD*US`*x2@U?fc(;
z!6Mo#=oSe;ly*%p6D9Xzz>BBoTzkO5if)DmVgvzXj2<no$6js`l0MtBG3T)g0>2-d
z+GOz3&qMLoxBcsS8ND3W)t1-$CdDEpL5Y|};%99JKhxo=OjuY&Fd6SW_GS`zS<X_A
z(ZNy_=-3%z4h!E$qI)M7)SBXkmwfi}vG`~YIm?L9Ly;PjE?5qKQSDv$@3EgTkgPU@
z-O3NtRg*kO-{Tf*;MuJ9%X_a9k=+-lJ(_=h@V@W3ZxoR)_PvzWKWWHlBuAqh3tCs_
z-67hz(1K#!pG+&qX`IL`m<I)6?T;HryXSSvRmA4IhkDR8YQ}OTwV#d-mL)aEC1lOO
zBN%|9s{no)@5j6J4<TIgykzy7BvcEV5P`B9`1%3YHzmY*`>EZ!afE7QK2KP3#My-i
zMq?7L7(8)}|L2P=gf#nu_=Bm_flhK3Ca_1x&dfo|YxI9u_hkq7p)+%ex92)qhO6m=
zJl1;6yR?NjM&$VU80}(x=DG-F@7{TyuU+WhK6&IHa8<ditr?uV?VY~ndUBEK9~2B9
zU8T0kK_h-(Q&~^sKNrnEc-6v8RHD1i5~6xFm|a+XBXFH(zXnr1rZqhJDv!xz2ov@;
z55XtyxLaazCb-AELZpk?WQ3C|V_2cR`!w(Jo1Z&+lg7)?n<~Uy%R>3_{2pKy#lP?$
z4iL8e<VIFb?(#h4utpABSIl^e5C%-@Kjs#|+a?vcVitp{(&nti9}1ic+Rxk?ODc3E
zX2EnOEJ#i=*cmq9Fih@FeExWrjCNo4!8Av%M4x}ch}KT3yYkBZ+o?!q54J{BxKc18
zcb+k;T2+YY?T+8s;43069qN!KK*P<3((3IfCtK!ok>Nwx;o}JV?JTDsuAL#_c+jI8
z5B)@(+m!%Q-1K#}jw1BB)QOt49*BLU*Ui^1BHAT6>aT%B3)SIZJ8x0UE@%@v+oZ#j
zR({M={|oh%t0p?i{+qkX^kb&RqSKW)I{_{h>Ec4ndk<yVNOqA6Q(OEI$$ukWgJ{N$
z$bBJj5dFI2?R}@)DGrF{6NZpAN5}!zue*=ePQ%pKHk^{%zGh@ADw_sgHZ1mL!;xqT
z(eY&mM+oODg`x3)Jkw&IU*+6P-e+CZF(zWNdCX_!R0UrqX`gjD|Lmo?%JAUs6>q7M
zGu4v5`zm@f>7bPvyi1~&L_p5PglzN_<e=POiy))CcAVJZC3O6Sum1;>fa?d#de`;d
zBmQQ_F!D1CAQ=3ytM{eQ0+}izXuj}Voa%b<acazo61Np*&s)z$??5;R85HInL666@
zi46C93o3ouafn#I$cDsvHDqt<v$X2Cr&ANJA(aqQ^=>Z|jLr$7g;&9P2<Kqy&ppT<
zB87qk<&TPc=?Iq!PBNDccvHb*y4)no+~+N1J0SnpUyI6&;=^J}u5;zbS{$4Jp_1Xj
zA#t0m9c4`BrV*CkNDT*y0ruob94YTocs}C&C<!*Y358GbqoG8(J$%kRHX_H4)Rt3#
zfBhrAe4Yc6z~AFJ@m`?=!M7Dvw%}`NVn!k-THc@q&f*c8iB=9L{K@+)1hPuG{^&Mu
zs~@A18NN{KSDM!rebxkVE3`e)FEjQc%p5D(JI7Mk*z)Mf@~2*mI%ZgUZ_SyI>q8K&
z$sp~j@oAc_Hrv5m$Uf;tEb<!CwqLE2I(KExw$Yk{!P$eTK+{Q7{1Qx1ulGTL(=}68
ziB!QzS!!$*wEL<&qfzN&!5a21Sb0<a<ruFhpl&NT`g<5>3NlNk^Cz)ilh`ai@sWyj
zM!n?Yp-Z?(=kL88(0d@~UU{6RsEx?{e&Hx6@=xVjMt;|VZbyx5HFzmd-ftt)Ax2`(
zTT0?}aI)Kx-sO5PdzzMF0%mnU4^8*>p0m#Ml-S<IZv`8{bPKylpXOMQ?OSBsU{Cgh
zW_MjNi^TR(*EL0Xk?BX96eP7r(0|C9_1lJSM@G@A+=(Jl1nslE67&I(8$#5BcMC_?
zZ>aDpGUi!wYWK*U`#kq+@+x7_Ti&_)Ek9+H->+SW<$t<XhcBbN<^e?@O6>X?Y#d|b
z+VA4_SMu67X=8_lF}~AS3Hh-FxobVrz}-SqWX+<!*k-=nqsaGv6j$<7|NL=mPsR#F
z$dmpnMRQdK3GQO)rL(G3Y-(?R_@rwt-p}Ew^^4zk(;7RB9-lo=u=vx;)?ThqaL&)t
z7eY)VU2L5$)}r{sj_&c~sejUNqTenS6|1iUs+6Lww7WhtlH60h(0(n7-YykSyF6B5
z1dz#YE5o!|Fv>S0IL^P(QB+|R))~KLVNWs5>3uIz_wW#{^JV6QBp1K5dXPg@jb8LJ
zWO3)m6=yBu{;xqdQU4LgW+Y^-d6uZn^`MyWIIWwp6TEb7t5Mz7I+pg;rCQWe?OT*I
zs*;tZ)d<l2PJPp~gW3$5`D3yh5BSx!pH2Sbc-Xm-QJ{iaAtRNRp#^$1K|wJ&-r-wZ
zwBy7s{c4;=yPGCiK5{=>{F%LTOq*Rb_fbMtk9EPZw?4vDEj|WQ`Nu7iDs4vY?yP(X
zqO@`xR||}+BK8%Mq+_BhLR4iO5Q-}i{-0?o@FR$P%7dkoAWDd}FyG}_M3$(X=|lVm
zCIThAIuZ5f3_G9t)vUGQ#b&xE`F|Xpby$<%!-qEpOuD;Ux=T8w>q~dHlpq}&(xP;?
zqzKXyiZBW3Mg&2+QMwu1`~2Sjx9d6ExvsObb3XUyu7VYPYY7yV<Lts=n#81~^A`_v
zJZCz~d(IiNV8qlddd4yotw#R<N*p~=#y8~T{>P%gR_+jUw2+6@L%7?AUzg4m$ihMi
zN*X+r4yirEYC@Ca734kGy(uNnz`*Vcgp_?*Jl4Q`9w2>6V5H*QZkO9FH*h2L(!1$l
z;#S-TWd8?)h4&9F${bv&PUos6@AVOvV;A4KRK)I`7k@RyLRrQDs*ZIdlg8%XXsdY`
zexehTq#`>H#fH}l(zCy;Y57ju%DVXO`-Qgu@TajG=fA8NZ?z@-l$esm9BX?n4F1E}
zk=I)Mz}=Y^V$Yu+xCJZTaC@5KrkGNIK_^rIPNjd`RUD8vEf95dd08?1O@#0<J-dDC
z<shBpfkfyh?#Dm}eOAWI?;)I%|2pj_4PZm6SZ#3U@c({4xzV5>kz4!y;suT<*(c*1
z2~!comUZg4wP#ZGSk-DIZ(QbW@GApp{a(NlBlxuQ&4ii7=IioR+yZ4ZXhd#07+y)$
zc$|_v5I*0w@}+S5ye%HrZLY{!*1r55nrSDxm9O556U+$|1)NI~cUTr$VmxvD*zl3b
z#Xe(%YAI$FL&shGB0T`Y!e3lqA{g<?ro1{`sdhM$j=bQF#j0pI$2_Lb@ks*57G1nU
z*dhX7ku!cWih9&+paHuAIrS8-OS8~0kykstGh_Grld6~fKDvAF<|f?6byv`R+ymx?
zR#CJ6k9oMrl&7rfwIe5I=&edj3pT&Srqc%wCY4$!`<Jf^pIkFC-*n*(RzAmUa{Q0d
zLv2%>HWRuM;hoTWJxP78>v_+D9AoEpf9A+atdu65(Kv?~>AM?8hi-4zeDtdh==#8*
zig<DDZ&moi%H#R2IEFzEP;t*`S3AO%$oHLVPL6`IxqH7?I?eTlf^Z)Ev_LyBlN78Q
zuB3}H`ta}D``77d|Dy5|A=zf(>;&Dk^8NECKViF=al)dAYl>Zv)o{bQM~Srn9SJRJ
z{+8<D-MC#!?SY~Q*?;)7?!g0X0@{2d?T#<X;AGRm3~yWL*!a`RmD7*;fdxiDj-9$7
z*$`nni3m{)Z*1|2hN?DwZtNrb&xDg8h&jBjx>HE8YOz1WwMFiO*V`ViPAmw@&D*(!
z)l!E_CHxg`O91$ns=XNbAC0bt=-{{%pLld;hTPFi7&(Jy7oy5XC&~Lz(Yfk|M)cpQ
z$?i#c^)+y-T6bdS&n#lsFs?0l*h<NL#o;$CV#w@@iScuFD95d^Cd5@Q!Gd~!w}!RQ
zbjV~y?1sIr?V9|zr!lv|eh)Tldh{XFz})BJxrsTxv=#}$1APOMN^`HLm8#O181MAF
zO8M1{=_se6Bac+pSLX&^ao_iWjtwJJE^g#*;}9t>c~r7yf>w6de;k1#AB4IVf<xpa
zntk2<m7-^DL+BSE;gr#JM>VzoN#gc5Gp8Ccw)@QCbeLPKAau%%&R#uvWLVvo;?#pb
zd-=mBPATDyO8$P5hjJA;buvwpj$d;bAs2pJIh>hG6wHlRnQ&T?EVfI*`yWrAIc2@z
z6RQXpEjnZWQb=j2C)dSW#!rDE^V=(~JSZPdO11Ph4Z<7q1ic8@Cr*3JJHwlAaW!~$
z|LI=7J!ka&^p_5g3^%Ll*?4L*ZCdqKn~+g=#QImP_g+x~<Ss3@7%Jza3K7Oa%FK3$
zmF+~8oJa<Hol$-cW8rrbaXXOP!gO4`fG<^+pw`EmTZL2px*T7V1AeAs)qYI}&+jF!
z$^ITPGfjP?%-HL3MvHsH1Q*oG=|(@1s+Zz%197<8u98RAvKnT&TTNk#AA`8clg2R>
zjn){Q`;eF@1#%jKCyq>TvXs5vCaX7IXI)LBJyjB;*^~8xGqxq8g4r028gKLO+9CZ@
zs;m<QPaD;AO(6ye+7qTMHGyFwU3R$bZ?~~dTo1sSf7`motmu?^6m=5?`1#22xsS=b
zaq!(q_{n>q<!`Yo`f+&q%lZuyyVh7lEhww=g$QWuF@9Eno3?|VrZFb^$ixMN+iC>Z
z1@++(i!H0p3WT8DF=!El7e3W499ZqXl87<5?8X?sME%R!u8=Rd{*a6!YIuR31-C{W
zSJl4kMXv1jkm7C88x4x~l8lUzhm2@GTQ<1$C3jT=cNn2v$rDj%6em;5@`o~IM^`7s
zyPul|$43U@N)neb_aeFW$~vVk$%Z)Y7@>>&OBc3$!X6xEy1R7Pgtrk>w`&xotR!FG
zp9Kj74!~6c?<r@khDkJ<7lnjwWatccV5Os?_H1wN$GS>;b)TMi)2UbVITw(Pwx}GI
z;=I6<3g+XJCN2Ovk1eS)V0zp4YG|xt%L-ATMGUJk)@QtHrqm$sv$wWUAVH5p<}3B(
zcNHRc;yAzf3a~f2eB}i2VPD$Rh=YaC`o+8lg-=`zMC@zp?RfWzC-es#PCzl{I2!IC
z9W!y`i5)zFtZ)Vzfvm)Q)61g(>`s?OyyAa`&?zawxd>0P-w3MpSUs2s@x9TAz8M(N
zeOcmlo0dU&GhOtL6uVYMdR?PxL$<bIWr18%oS&@SCBkU5%*8#L%ro<!xyrNxnN!AC
zN1Iokk=ooJ76pFF&l$y>E{K~B-GA8Z-^VkuTRzqedw}$!C4g*z_d@<HOPAo;^+i@|
zh^5}H=u2h+o_9xGvNR(Cm78r8NFjz%%(tRz@I7+yP0x3D|I(ziB5$tb&}-Wq*Q1P8
z7W7wDmDUh7Pi~XS2Q$#LqL2{9ToP`cdApn>i5`2U8QD(Af4NfZly2kF(O8fCqV5@<
zd|jX0qY9sI<;Xg_CM%I+2EW5=&jG{MM)V!v%f=zYC~kHB&r7?qJq;tNn?w;z3rLCN
zes9Y8IB8;rXHDF3#$!V7F}G=eMp%3mh@W0mj!uUJqTtEp{%Mf*ZN<+<Rgf&Pg&4`-
zCQ;{GB+YcgJW4m2JU5`FjNsC#&}-pbUt~9`S)A&)7(9f?pV)LK`zDhS-$6WG?svf%
zwh6|kxQ*Qs<~%sa6K^#)H~Pch(^5JM3S<v2^t)>QTWRz|zG|uhoroc~!zHLJ%=Y6o
zW5Ud51`nUkc@jS*7z?FvTZ^-@VBM61J3q9nnxIyw+xS32V?Tg$w3dKO6vU8NLv_24
z)BI7+XTy)$QeVi=)j6q8-_N<*>2Ut6cjBNdNuwyLtt`hcZL0PR{ks42q>B&5KxB&n
z=Y??Fi_21fM8(4UiKCQ=n_4cUKG`bOs~<7DUpAsuMUzg^!~0XA4y#KXzg}UEvlMQ@
zu<8Aok573C=Bg_h%TZrGgTF@pBfXdfC-9blZF*t4TjoZws!_0E7~R|X4Q8%?Z3b%9
zs$(ka20MFMqH2YF&kNf7B)@SdDPT@U4-ioPu0ok_MgD;IaQ-2tvFoYpXJ0OQ_cO>%
z#bc!4wY8~|uDg()7iGpAw4-V+qIV(oQxnxN--$|%f8^*?tqw*LN0#9_4VNb4f!Hs?
zgyq0Izm^qEJ}Y9O3%7o9L41W&pv9i}$zsMQj~Zhy=1yEL-x#4Szj7n|)YB|%2lrD#
z`7dS|WkwfIIHaqD?9c7-Wm*XK4dH}I+mD7(V;!z?DXntBC;QXzd@Nn00OJ^ww2K6S
z2XmNQgk)5Ex-|JAtp3u@4>FnmqvI?!2lQ*YUK{)j+EU~m)+hC&2oy6};H8xsyj;8}
z#H|to-r%GJFDyDJwn-ae9piEqC-Z)o@LE7_k-l<6l-Nwn^APNj#xE)J!G7)jc{{|T
zFtmbfU=M)yAT1(qUvIU2RqH)#73`&bbI{D}j-cbY@uGD}#$6)m#ZvCt;rpsO28o2J
z-&*iUx@_WJAB2=nFCXNArMBSuMl7OX)%}!%h2R%5tXADeO@qv)TQ1@i)v^zmX&7+A
zgP7P&Ide7rEsa9g@b}2I_Zs*-4W)y|Gz7yWe(SfTMtRo*3E=*=2KQJOTS5;Z+($WP
z^muA~$V6KIcBD^qjY3Vg&&3L~R40Q1;u9ObCO_>j-W|ZBUo-pMZNZ4Gb&dy}`qVIT
z&D~sWW_>`K@8KE#EJ4BwI7m|g*)HBHzkXv&EOJ5IE#RFSeD#w(?)^V@@LG@tNJ|S8
zqJ#fAw~vY~q2~cX+2jTiThIRdFaXkxjG<;Uuu6&Mw;v+!oWL8hJolfl$Ri^UfE0o0
zs2@3v_<0FTKw?gRH}@5U?#cdxw{O#nxlxG7ko(gfl)N(v!tGYksV|V%-mdJXe?=!H
zaE6BKJvTjbGCF%)?xw#)fZP@DgJZD{QTae4@7ZxS=v689v_G~<7v4xM0AP%w83;ej
z>x~bbgij#t1Z4M}c>D^L;6G8UNl79iuvS$x_#~Mzan?e7VbskgOuGd6Dr93BC(bM!
zgU@EU>LipocZ~23ri0bqsi7@NjQ{Vy<r@OfBsZ!0cIjt-M2Gt1{9jOESXWE3eeiRX
zSO0w2kIQ}=%$J^*g<3u!^}asC^&%0AQ0pof(c&)3jntRDZ&Kdo+v7(h6UOoi#?d2@
z^M%Mkpm5V6EjkshYoF}OLNU=?e3TKZVxM_Vc<P_!?Njm_<b8tG->AY9Znxw1A`6NA
zx-{jEVf3VDw6_9Q2tX<gu^j7W)V8b=kkgns%pn(M#p9ajuiXsc>TWnP#lJ!AaFdcJ
z;C}|p=hIL^_z6PNUSE)0nuD)?RB?7#l5MxA9%84BZ4Whv9-n!s>%Mj85m~JI;B&-h
z>o&I#9XQE76Yz3ZUZ%fG`#w03@e!v9CjxrxU#_cbt+}j{`&)Y3hXT|_5V6O8!>vF(
z8B5}APqYY<$VO8k<WC{yTd`T@o4(p?H%|f!5o!lPUKOx{l)1dJ(%-~xpQ4^VG904?
z8V%3xbt?DjCJNsNUx`2~&?NwcAxFlY>sO2P!gzt;A$ns;LY0AW^q<MX<sD(8b<Yr)
z-1~tQ;<bN8k}|&8+ptmv8xXSvUqD&CaX<(SS`se4`XH?oJ!9xN={!iZ&fDY-@+15U
zNF2h8#Hr?f4IfNNz4QSWBFyY{eYs9DKl0mRJLX3`p%Q7aZw&kTDBMV2vXw_*9ELq{
z?+rS_wbz6Ff+rv_eJkEnGD&5QCuEi~WHV*H`I?2+Jfr>;Kj&2cg}@6#9SBRTLr;ru
z1QsmZn=R+yFokX3W9mmYAoGr8&a&)u3>(3=eNuym>;hc_7fetiIPG^U+eJqJyMZ>j
zVe>&!;DIR`MLA>-#N^O9Dw*#5Z##-X)YtFM6z?nFI?p9GLFq97oklbfqJcK|Eg6ri
zbnd|Uf^hE3?{!OS7{2h6j|gskv#C*yE<^Gyz{m!0C2I(-Ii)=j>NBSdp=fpm`kP)5
z4CD(Z;D3oFQe4it_e^U=2D=9X27+O%nZON8Qi~kejGjh?Je^Q^0OHAl>zNa5Qz7zx
z87i)xNJF`cdFVAf6_Al|t(Og;u~W%&erPE^`&?8Uj@QlqhbP}}6G`4BB~XmFD=)o<
zCOc(XkC`dWMB~vP{7l<6LwuGhdc^Do_kbQa?h9Mu0!P*`R}^yZZV8b%NN@!INGgFb
zHPuRLCFADGnl3=f*sa%b`>EZPZWvSu_g1w`ZBMbBGK4_7`A3Lukz#aR<1R;}F*cPb
zh7h8Nr+;`3n13%QOmvC;W<PRl)LkdoL0j((VUvo5z7Z;)o8tf-1L<%y2$sFC?@~U~
z$)5K4dXWviiUEA|d9YT9g#Q~S@p`7Ex8pR8)o2iqc3whB-_h(TCgGHPZ09q6x;jM#
zI4Npheex9``{C-$8u`>)lz(&EYTd>J^`hvhIET}75n2$hf-lDZQytmpX#7fdT$BgF
z#Kx2xfUz(%RUv_vOR|Br0+=m_Q?>iM6SM>=tXhgln0!PMvZ*s6^K)&uhdJv#C&7#i
z@30i_9o4DXmF-n*jai*r0ff?4@BcxUc)EQYd&I@dEtJJz)^RSLtu9>hd3XFXd$-va
zv%$NDjg!|h$B$++W@~nt!bCxSvYG2k?}wNGZ-ye)^rN8;YOv=aFZto|9kyQ`CWwXW
zl};16^Ic4ARqp?s8&sziuLMI`Qyx0s#)%DW#iqAH)~7hIM)Q!Z-$Qg~v;TWl^l(N2
z;4F~(-2K$R0V@9X5&Z-t4BCygpnc=WWsUBHW!h<McnAlqPd`MOL^0<R9Vv;ORA{T;
z;ZgW~y^I*HwKno+!O3fruegdJ0{j@<Hk$78^PLK2Bc#LeC4gb%BlJmuGTp2wFqe@Y
z(*e;xpz8`*O!_FNe1o59Z`2bA0tA9?A-6=QU9Y$rDtfsye0PQvw_*QdU8q8!f4R{j
zUy59@Kjkl<?-D-t0<H`Fsy(BbHBIg+&%e9dXo7FKEy#o(BI4vD&-`57b+(<}`ra;`
z(6#`BeEJ#pnTI>F5Td_N(MCTCb=8P*A8|zO2w^B{ZUyk2q#lG?$a~r4*fEB1{UHNY
zy$y>YQqOK~+eHATnz_pkArNHY*71x%U~zlbt45`ls0uGyDG>UNlOf3^)J>%{81dx`
z(ec)}h2l;BN~3@gY#G@46VQa%lEPKX<KY5ZSOjQ+nLQ$ah_M)bmtSv*%$YmAxkWLo
zlburi>GdB=c<3l@Y5QoB3hIVLKmaPl1dG|>ux;`?$-;|ePPAn7&0B_`I^;>fBf;!i
z?(kAv<nbdD05Xx$&2GQ>!ajht<_ho0!gp8Z8ZexSyjJ_NM+9e5oqDk^*{G17vB3x|
z@L&|iDo8<XG<BEH$m4e)av7s%HuB!k-`?<9^VVfqn{(7Sefov1Jcbq9j{1WSTeFNC
z@{N8(l+}#NjzT$|F_%wqv*sCmV~^=DiU$or4<~$H`C5|8zb*$X2yy^202AM+iC)7h
zMGu_bSmD66&*#dn4%6a%&;Y*yXj;M!?y^qINLqe}7D{Vzp_$Z2Q{{Zgvd0rVg#)&3
z(TwJ1H%t3K)HF@oPj^z_Zg2a9;q==fhzK*7ZKKqsCu%x8G&morm;(Qe`n<H&{`|fe
z{5K_wsa1w$rk4v)2hlISN2z_mu8UmsNHV%lp#v7We=aLA19;a{j1OZ(AHESr_CVaS
zO~12PTcXoA1Sf0d?0{j(tc&`YbNigy)5dMQzGM>VN9CU|MKhMwyO5p+EuO&G_a93x
za~0wpa-RKd>0h{Ousc9S3Un!G^}aSu=AV6#M+5p1f@T($o%bprY5R<G(NmmThesne
z;7k}5a2?Dfdi9uoyo}Pg=l>&<HrSDO7m$pWn!Sfp+fUs*eH<Dmh5=v_6Wo{WV9GpN
zyDD_}kdSU0{nZnta{y?9j}uQn8AZw4@T2boZI%?cUdKnVZ42fG;FENm#u5ZPyh2#t
zMi^K13M#aueHt8JBiz}OYYK`(x^JnMo;#iUE}%b*!ZhwA$&YqcPfoYkPHD{N#_+Ek
zj=nXOYbTi8@NZ64otL%*vrwJX^<(!Ftk3CQQW>_iFD?asivWC1CvacHnZ`X?@4+1H
zana2kV1fU0_&}D9Hr89tM@3a3BY5LdwR+oaT*#FI?Bzw!oiFa`c)<eCcM6nEqqz&p
z@fM<fQFlYxaHPOJMz3z9H=a=vozeyO@_jp=t9~(Cu`&(<x7Hj1ge*3_@gJ@ZFcjg&
z3{DB{uzFW4`hXA^6Wn97D*@LaupeFpNuSAQ+}<p^%;GVF1mOJ4%=>)8<&)6?_IS61
z_294a##wiz7(+U0zo;Wh($I8@dm^Y(@R9Q^|89Va>)M}PZ|fgu&^~Bck)RXQ?&9pA
zOxe(ke5MRMQvh(N0Tj`IqC7@ey<rrVgg|EL+w)9}8h<5|NzQMP*NxUZ3Mr@}<&zH|
zzzp(#Ro!8gNcQn2b_C<eo_ku6jX=}`?tLm;&?|5myxR$L6LVUNewT4#zSC<o0%SjE
zdzZ(VqCVF()d6WOhh(Fax>8Jx!;HD!y6rvkTNdd?4!xpa`(xHqLAGy#H(j70>aX<v
zh@j$+2U)?(4H%5vXgd%HxJGY6oIK--ium#x$fK;8`^vLEO1GY@xku^=%-NYaV!lL!
z^0qfq;oj@Kfa&hbYgFT`yFA`}<v+hn9;q;NHi~k`4Hw>~>D>}hoKm@VrjQmpC3ba^
zB|q@2sqUUGtEc+Max~=t=+*wvaq8u_OS1E*ndhz|DhD=ej0az{a)c+f(K(@G!*Q1u
zjMo0EuD7)pLDkZl)uqY2KYMaez|FpygwF6rGvr#c+Dw)SWQ36Ibu9WuKglk2A8saV
z2hD+Vj8qa*4>C2~-0<A9t+uxzl5XCM;hsRa%q|qRT}Uo;swFi0gmalMt#V_$R#^rA
z($=W_;kdZ=VO3}a+H^o<^9c~f9O-XK$nmq;A!+8#dz+(W6QtyZ)%GK^uJm<#|77-y
zeY;I8Go$lvyyv{gMv%q&8j&@bFDlQ~&u?iAvZ~+x)T}#ppn@T&(Y6(BWZg#(ex=Cd
zb0l)2t(nJpQDZUjcP_eZOD#9*uzS@%&a4N27#%5bVxxlQyXydn<v*7P2f;rTu1}FS
zJ<TmCkXv%QIX0P+L3Rz|T{woNLXiP%Nb6m!mkq@u3ov)889OC3yWrD8H(sSui^|Av
z$*YWM=qa#iCa0HmJH6~Nk@l*RPp=K?BYnC~XZv>|{*p+2ZQr?sn26ifYU?GE*0+A6
zS(i_tiaj%0!cJnKpwr^VfB!RaDAkXI^~Fu<b^d)!iK@fa9l_e8k_`)$dki$)8ahz2
zWH?c~9AaPn&HZ;jkVr#as8<<QY&XM<(~i>K&t;NW)SGu!uq=iC49^`+m1ezbiWd_g
z`~6k+c1mn08%d-A8-aJop;99h4t$9p=|p(n+pOUDGoXh)C<Jml1^Nu56Z=F*fNnIZ
z4Z4vjslwkF`8G4(l2GyUXq*NG2jW?9M&Yo244=Cn`LVQ{xOEj!ST6O+OeN*GeQEMl
z7jheZ>T7xE;a&2w8|Y(9!`xcyszK-W_WtmBjl=9nd0zAfWHy>Qt)TYgiN$x{HG?P&
ziPySkLYMHaHj;Cr6wqIVIKwCU1BXyEE1}5Mf;+>zH!n(l33_VCOu<vox@%0s<m9?j
ztCyj-RK@!TZ%U&W{cHZ|mI|80L=~W;t<$o8FQ%PeRLB}UL{qf2_BO);J(zX4?-kU0
zbyYMt4!<5eWXF#Ef^ndo?4qJlPzJx#QM(rl+PPZs=)G0n8uhQRt;bPxuOJ%iy*MRj
zw|8Mm@F;^!9_jr~uw%B<_+5sbjIQ{m^-_IS7hW1Bzn-*`b~R6Trms-tdFzhE5&DjW
zVgUcz5>SBpy?^z9V!&!{6<E@jYqYBb<)iAOYe)E5V|u?UKthmmC@Aat(i-vUGffRN
z<D<B=$SRr+_w}2Y(_j$m<+qs95c#Y*QkdIgnhzT3OC9pUj2k>4UYd^0^B8`~tBBUU
z>u{Yb#SXb0D$1mL_VgNOhqAiYiTOoeFS>VoZx6pMN+%Wy(}!KbUpAiO{gP!OI^#v~
z2Fbqi$t%Y}k`q*}ojYobxcv8~()~x3L-&yD&@q8ZrEvUkPPwP(fR>L28dHT``>cq{
z$f6PN?1%Y!6S>h*0eM2ibG|>5;?h-pG#Zjm{(FZe@Q{$XF%-+4v!l8{(h>fg@A9Q2
z>IkTwPH*Z@DNQb%Yn0nt5MKCd9L2-^+G2sa-fVmGBe9jCPV-+_vGl5GGm0*DG0^9`
z)Po%VU@+VL5c}YaOXE;BcaPSapaQ7gU-<E$&~sMpZ?91=rLT%wj83_>-3%MDeFLaX
zZa6R1{kf|`KiHh3tItKB14fS;BmC#zNMT+VKb+B1A7H<>^!cFJw6E<L=uX1C-^^C!
zx#e*jifF%@Bj2I6JElJsI6x<n?_3fqI3XuZcT^@-M${IGqbhhw`5_%}7kg{}%7!%$
z=>EHQCDkb`O6h7O>x2YUc?VbrF#-@gy{tK|pUHA?GkN!epJ%wNx4+82Tyk&1`wNa4
z0~}0d7TX`2HKOqUHV+5OJ$AsKa~s|)@r67oVxAHRU);CbX+EhwMZ*I}Q52WIEH(lH
zcWJ-!=`B4C3=)4NX#%Zan?_3m!8e_}MeLNuY6VPJIEMty{7&aQw>qwkwGs@U4+E$X
zW_KJ~1Xi=rhLBYX95V~{7fqV$d5|7G-#(aGrLBFYG7sD9hWEW-J_s+Lzl3wfoctc$
z&M%;0*2Y$N=Oi|;k8OM8=cPUfk31U3m3z`+W9^$|ly!P2Mc!VzRnu_S#2Ln~xn2Z)
zm-N{uN7a<`wY2%YUMn;`xO0qkkI_%c682x&{~}ifP+PsO+p)V;j7OV<@1zep{*+G{
zq8M$H70CEz1zzIGuwp>XE?{r(Wdyl6Wenf<Fe<o3FK^os2&0}~uDBWmn7j`s?1ai<
zFupw2SHo)u7;#qn7tW=1nI#nKEI>NVHCtg%tfCPiwOphI6=@jUBmtm}A&D-5<-gXs
zefKWjIme;|E7*(xKe3FUp4of*>7Jx1z(VzSABHK5dF;Qh#tAj)0lh$MxCjRGKO=&3
z(SIQo#gxOASgH;+sTkq|WCpmh8!HR}3O0q742v{NZpW2w6d_M!LC2t0!%Bas?hp7C
zFMw_206c-{I=E?^*LX#UTM}z?=qI4PB7#E^{ItwtGhg)vIb|NzBCL~#jn<`4iSss4
zr7b3FXD2{ln#~@k6M#~VTeR8P;NHy>*Xuyys|eXEXyX|eH7IaO;f_&Q<urLs1NAWq
zJ@!A=Pnhn&Mgpfpt-Skon2dm@M7;+rg&hPZ(5pSLQ9|<ogMPB&TCoPdgh3Ax*Dipx
z<$^i{vwk`T>kiF8y{Fg-Y)1tA>|EmqSe1>uM0fkMy{5&0cM-~BQ|2o*wWhp41l!-V
zs5Npn;R1-t97yIqRl0QK41FuI@hLs@=0H&|3A~!l?Z9^0zu&9-<kdUdbo|5z@VvIs
z53Tcn+}l<@Hlp9N6(_XCYB1~Mj7+pTf|Z!%2VQ%vuC~_Yt8HvOAUpdDFm13D){E{o
z5iN=Tgap-1IFkI&&;q2E7i$-s3D5JcL2o~(MN|Xw4ME_wU?Fv&I+@75M1I4aPJ$2+
zv;cWgymi9macf+Ai&cg4pI@-~fN#SLm<RzYOvA8sUPYs7y=;ZaYk80@hTs4ku1?`&
zR0YkQQNUV(*a87w{b(yOkPR)ITls#{1qs9G#$=oWVaYb{F4$!rHK<K1Kx9EtVB2K;
zT}8Z3-BNa7Lss*x7h@{g>hFJJ`EfGknupoD0B_2O(TmXwA$sZ^sh4Dt<HW{x%XAt#
zgbx2Z*AXDVyE22)z#opr0R$zqAp#_o(hf4RT{9g=yITJ<cR&Cl0TK+SVP*~c>rZH_
zmDF8=HmZDqkjB!EK>K8)rG)@jL<v|QK7{aALqG~xGJ8Mqpn;a?MuZWgKMfn`lMpvm
z(?N|F#QN_8H9(r=1UwV&b;QdnQwV*Qfu01(L8G80B9Jtrji4O~_+$Dd9ZD9{bjg(l
zG~`uOQYJ_LsW$+@HruNOakNpE$r?V^xC_2q0}`o%z{wdd3qsYt0K7D*fwaK31vMtE
zIt`=}Lc+Ym=zxmu)5P1yCwT!l?`1F*k2(*8n-3#d?c?_ENIM5_XIhMs71@7XWZn3^
zX@8(92TX4wIow2gpdJ&N>B@&Q(hyA$Bq`WCrFDMK=c?1}-TsLL;-5n?&OTnrGIdkQ
z`pn66zvdL69NCbsX}Ez~Fgwe+U_MSpC_rq%`p!51*3WR+MhRQbUG7hSq1wUC5OOK}
z;`l{XD$5&4%#*G9n*hrxmw4aE-={P^-2})FxTn;9h1dh&McF@V0&`&Tl2B@0H2?n-
zFr?<Y+;v&sT(uQr_$&4RDboJV_KAeqk)_UfsUOXIUlXp?Gc*U-EAB8O)9qq5Fyuc1
zM>YUg0)!#5T%d)WkA*Z^s)g!}vUmAlM!>wX23L<*{fCq#(sgNLJYu`^J%SO`4|{(m
z6f=$XD6x`hbW~KmynD`Crg6+|!4B%ko4Jyoyzj<{f*gxRw8qS3`g@f_BR)O*{sRpI
z5k*U1OMPkFPm)~Ols=ugy#RQk6}QX?J?N`w(w|%VB2yq;Vwg^7wFijOnXp79>m>K!
zL>mY(%fr5E?go3L3M~5m?s*GvvkocTx$^oYV$ZS0<cGyhiEpuh^uBg<bH1EWAG-Z*
zn?NI);*{u_5Sb#AFeMzE?;&?oeAw1+u`U1zFn)A>YgV|5dP{&#=>GKn$$(Cn-Ap7k
zFYZhtmLlo(%JKe~3a5UFNQg~VsKs1SuGnD2-`bT7AVEL<=m**-{4!sa?4D8Ms&A3T
z?d-R5^UW+&i*xIlsQv2(B#zNN<D>RU<j^C{0*Lj^8J{*S7rpke140%HK{NfFa-(>r
z#9mcu9hs6cYe#i-6T37NztUYK9N}@+AWj4VPPK?bfO;Cyxa{UKv#tD?xv_ni;?bjO
zQDktGe5FXxc2Kr$HY#_#bmD>Pr*2c((&m*8aE@tq5Y=0bA$`Nmj8_w~TRHgMoAD73
z!N#+$5-(`q%lIJ`;+pk)6vl;~@_4jC&D}ldve@o1g&<SmF6C>Y&kSnuU*gdKc|cL+
zno*!Xzl2;-1a}`_KeF`PH5GWCoyQ}sztaM|_`>NGXfk?Y3_x+5%vwu8q&>)3xf2)U
z%X9Rbg9I1!y@sDU-|MUMCxPX_ym8}&WVR_~dOa|);Z*Dl?c2S-YE?scsBQ6g@eggG
zi-ysn&+HfCq;#Pn%)eSGkiHll`*OYZ=d(%0tsw-c$}uf>izrioCGlNBFFD*vGj5uW
zlNN*blb(GScTHuSp7XmXVZDxVQH10Hd?Zomc858@ClUjzfBEGvYWdhAxCs|%Ieob=
z$?y9d+nZyl@DCXIYx$r+MvIjlEbe}NNaypTD@C3r;X+ms|1~YmnvV#9+i=ab$c!?o
zQ+;eqAX&_`xZ8cw2hs&~z=u>DY4T}knyVU>AT7Em3Nr0Ogb08`s@*v0m{SiE5Hh3)
zNE_1~W(ZDEhw9yTV5=h6qOB(aY|GcW(AN>~ddw8@3o0=Fo&_<aq9`oT7%{~L*z2+5
z(<eY{S<mPcd4KK@0CpUNzltOX;N$cj`^h|bG5HvdyHe5WzR#5YL+4}H(4i{b<5V;>
z>RfbFpmWnUmu>k*H;V6WJJz?P&@Dl=o^&?%=uGVtT|2C=bL7^y^*3p5Ch+UavDGGJ
z=sJn|aZQmp<LF?rO;?kL{?(2A9W1&185byqb$Xq$)6=~}6<{qD_%oPZ;?>}^aG4;5
zp$HQgW&XF083&fSv&d%~HE+I)0)hnLOochwr=AiY)I`J79_37s`Odef#8s=&Uj<L}
z4AcpFWl@5ym$!<OAFS>x)npZFQ1l-N3c$y658`(n*xN-g(go3Lt#hZ5-o5$~iTI-A
zQ<wor<v>TihJgnFOp$6Ak{fFj<TG?5`H+C9Z6Vw`Fi|v2B&2AEBstzabqUl|7U6F3
z0lw5|m;Z$}w-((Vy|D1&`IRz!7Z49AMDLAD<lhlIZd5?-QEn1#?AVNea{>ikal247
zA`Hvttn}YmPv-&LS-7`cr&;?>nCkx3ENL=Ua-d@1M+}V-Rdr&IEwsgiR|Ac0cB%o9
z_%Q*J4yqRk1o@%vR@7QrXBty)fh`f>cOr@>+PZx7jxc-rUZ+&o1?c|9_#bG~2w#51
zyuL<FIrb@+i=8Fd4#sp#06Z-Az5l>?ToS`Aitx^E`b3KeO%7p9equ#WX%LmFjVlYN
zq)-G%+?Dr8m19o~7w)tXG8NRXXedGfz6KBHBcI>lWrbnDa+3w|hrQRuGxh6Kd*5Q0
zagyzmb@_izL(>3;s9v~T$Ug&B58YZ=kO+r^(OCKnniv>ALx}3#XE%&lciKPM%^dyc
zc8qRp6vu`NGmJ0mi960zh_>7N-l=F9sBz~vvy;p~=Bu!uJKJ#s2PZ1)W}_$NA$HZT
zUEg~7$8afhM6cZ<aRZ3J-Ub4Sq%GKfAI?RgvJezbH}<gz%sBHq>^fJK)1HwgY847d
zQ^-QbH6{p2c1SU?{(8#I@CrigDQ+nM^@`zX`0Xa`#2UL0(TYRiaJlv0IGT0B<h`w)
z!vWGnb%Yo!AY1o!e-FP|L}}*N7Ekg5+QOI`xXC*BpQ`<C0Jw8#-lkX(e;dQwYOqMh
z(BmoaUqq%}?&sq&DMB%-D0-6Z84t*p`^&@~V~#gYl2xD<;DmL_TzmVid-6^r7*h^%
zt^@Xa%lnxzUE~qZ-yt?B`R*lw>mlRpe{|=t?2WndL|bMn>T^Sq7nymID8TM<{N@XK
zSE=B-i}Q2BtF|9nmup)Q+2-WK?TDfe;iT>%mupnDIy)tKjL^QXiboDfsF(IMA&Lzt
z4=OMxa4U|*LB?q6)*+-_K7qAwcj)=w6T2Nf{X8VVsdt66_C%k6I+=IdFWtWb;P}cG
zO8;)3Vi<JwO#F&by_P@$&ui6+1wKD`8M8L)T)5`+8wZq^F;SR9w>EKU{>+)woTX+D
zfAI)`;g^uFT_{AVl_S3%N7_*};XaO9#`mM^9MeMQb+-x}&f6C*k*&9n0ILof8WCkJ
z+W;5zZBaoIhR?^BVmCcmsXCFJ^q@=66EJ|SXq^xr#9zuZ$TNQwL%xvU&lZ_vA7!Jo
zz(Yn1kGuQ{re%%Ne#aYW{c{a#U0?^MLruv{f2_nVDrLI3FS7#A`{PtsFQ&FB)zx7x
z?pd^AX#%#~4H*(Xv)q7^_z=t?aK9QSV92VO+c~UR^87O1#o*-ZPo6avLoPgb)nh%o
zVQx+ia4f<Izx;aOc=6Y24u=HC?ghy_c}=rL>J(r_QSvQV+uXUf*`nvzn>o4d-JdIM
z#wbqW{aN|>41gZId7Js<(|TiSWPNh+r>VnF%&5C_b+|D5xsrkw*e$1f2UNW|f60D)
zFobeOb103yMtz;_EJ^tyO)Z9Sx#xNcYwe2&D5{*ICpqY?I=7>`D|CJ<PCUwDZ(H+3
zknS;4R=n(a>8#T(X#6cu{&Kza_HKmi_E%4Q+xLzSUMlwt^(&75gsNvep1`rYiNAXX
zSwxM$D*PR{88$f3q)94M0dNjcT$(U;N9c?~2kd751L2NaJXG?B%cHm@sV-OQg^RW)
zNT;eM|Kluq(Z9G^1(}Sf!bY+rh{KB{97^r(B7cqEt}0Ox(09k+xOL6;QGs0iQ=2da
zjv<AtZ}eRAYesiWxf~YDE+Vp#3>8z`>cc86z)5FakqY*>R)w)m!#8*cTv&8O)EMr;
z{3NXoV?K06z~zADQuk$yyYx_X3JY97D0C)!kKl*@N2Pb~-_GTbovu^{YGOHxi^|iy
z0T*(7Yv3!=J*^&%Pb(m4{>Md9cZ>IT)bq>Jkk@(g{JWA{529T#HZ~kl<c`sFHGiw`
zbd+E&9~?`J<Z;m@XpjIfT`ZOT5H$gA+}-}=R@S<9qJO~HzE9(2;CY_kg-V#<!`c*&
z(#I$+aE|h-V+e4{0Ugdr$Vh>63G;rjVRT{EwQ|(T2tUMLqkVD(7W3a8gfG)>h!`S2
zmZ>gr#d7+u)@hVF63(584*f6kHoBBk8AzeuKQLEM+<U|n6-}`dwmK%taEonh4}rXt
zj`7AVE<rBB<aE~6FeTSz<-gV9DeiKv8B}%BE({rM`))qWM`jTEi^>|vY2@6Rv}LLy
zGG`_Cb=ux#4fRuUE`{6=eV*_DC6dod+5^2kfHR3fD8L4z+)F?^543xW%%ACHe>dA;
zN^<ciUG<a`#YX}oP6U_a)RlNg0Md4g0jDTJbXYrjcUo$P2EPs}9F{STx8u1?W3o;&
z>Swf>sv%pZVSWhm{ctLNS*$p&@i*<<j10#?1$-*aGd>NP&{)=C1=U;mT6_==3M7zA
zjDMoazDw5wb;lkk&ItAvxT5~Uow6(7;!}h_&w)(LYqVs3RDm=xV+&F4|2k=Lub!+6
zEgVH-wd2(E(2P9~H+mLb?f04b5r@b5oglvRlo#H+@@GW9fBk!$qbIc4Yus%KNMo*4
zqbcE@@$Cb&Mk2e7e93uaij@04KOHY?;8<rL3`eX~7IV%_TezD|`w#!NF@;yXE%sN4
z^EfHpi=vQlNa5rAXDXmli>;$N{<8N_{E{;cl~zCM?xRc0pIGF8DU}7A8xq#EblVJr
z0~?f9%6X9FZFVN<SLK$IU3^R(_JZN?dl><36FN%&=RCccw9ylqYc(ELhrH$3ER-*y
zincW<z587Lobg20dD{B(AL8tDSx|L-Wl;>r58q^+#KTY`$fTWhqr+4$ohOJ%djQ0%
z+1?jr_r?RnE{UEIYejg$FEnbn6}HA<M9xv!Sg)$_$o$1a0cq;jwukyad~3|<iU;!w
z-Nvy$11xFcQr4fjKLx*s3k=|lN_+L|9ws{GTfY|%$W#?ATBj0sruG&tGQEh(?+urZ
zCSgUil6Tfmp46GRx-a#2eR##!AMXj!4lZ8m0xu>9V}=Lm@-{1To()u#W|YJzlI}~p
zAhNi3uO=s*rW}veRm%E7Qn}|ZCf^kak$Uc@CY=<TMg%|RxsffgOc*p2e7=d@OsFNK
z&G>2I4*agIOhwRGFH3_i7@05ipFxB;-(}>;&VG9mcT2ky{8S;m^e}Pwlu_`k?sd|)
z-ZV7MK@qLNt)XyOT49@`9xW|&lUnW~u@IB^`P9nVOo5Ik4L>@^?+N>%#7@!&Un4Da
zrOt8l>4bIZK=3ZcM|-kF26Rp3OJ$Xwk4U(DBA!`1IxpP`R%_Q8OYA6bZG<0Wn0fd6
zhvyR%d%f#24+dxACT#{z0pW!dgaorL#)`LJN=EgCy{aTfv0ISgs1(y1I?Vefqj&}J
z>f*NXIupA{`CNW_h^O#u{7S1&2})7meca-D;!9_vE3-m1Xx7|34O|Dr$NW~jwpDn#
z@#3Q~<lo!1nx2^Fm8N8Gi^1;0hwp5}M>w1}l|D``C_bgl9M25(#Ac&?UVx!b{V1+*
zLWK)@aV?hH`)kRAyzbH@z&5|Kf{&7Jt6$ttk`tdG&O4u)n77W&zvHTDi6dvDMa^yZ
zt8bO3WpN&ZRkHol-wC7rxY)Io67hvIm;KPX;@_wjj&hloe%s`VGNzvn^OAY%W)cTU
zoKjF<-r`UDh(;Fn$TIhOreOp>wG>H1v1B`9A%wGG-n2TZ=ZZaw2K<#|mPXKF)nV4)
z2@cxIrQNO@in~AYelz>^nO292^La!;m#%>`r8=ubS_6nw{C}SU=j>jD<8dSklbEtg
zqEhoA#(VvzzPhp0iUHMHC#lf`ImT|_>GL6CSqaq6D+nDYn!uua#F|s-gS4EfF4JzO
z(`$#~ru0!iuL&fz48BPGJ)P!odJ<6%UJ*7anK`Qu{Rgal%cM4Au@THx6Dk$%lyF_g
zTK6aM5<&vj(i462&!dw~)8Q<!p=r39>H4e0D#|ad@gJR{Y&LV1@v+~Cy%WDPKV2@H
z9<ny%8L)c(Cxd_R;`g;Z%96Q&{1(3+(yUpkc2riD2Ti{ltgQ~478ef230%za#)8BB
zXDh#dqQhf_CFrc^;H$7O_H7A~PiBi$e}u;~TAC8Ez)5dIFA5YtM%MG;aT^mlE-f02
zQY<8r-~Pee#r*|;y7|C{RyZP6wire<j#K$nT(ysP3~;u_JO5^UhKb3W6HoC@v>9)Q
z;r-cKi6mXgZD356Sw>wRp`DKmiT2S4?l7D7_HBrH@z|JA<)w{D5gDoDU$Kk|jrr;}
z`@|=7c4UtBbF6R!Ki2dwV5EV8r1k|eZriG|#&taGJ3e5vXDgpJ3+E5^0CVq{4at8A
zU0}X(nS3h3@Hrq`JV^>#^=YATSQU*gp%Msq%pOhXSvLgOBYx3;OK9i3{_j2c%QJ@P
z50!2&$CtDshom#=8~C}e;$|LH`_>kNcMScX5(`dc24;HI<(kv#|7+?~Iit8P{p>Nm
zXmYN@7M(?p>Z$tzb&h@BgZ=r3G6JflZGY5Lvl$2_x4|>RRT_&t*9oK02^{mc=?=s7
zgfI)psHLno$NquCX4F!2qNJ(%4<qM|@bCEZU%kRQr+DLx#hlHm5#aduh?ExI6Jo?@
zQgFW@rMlIr7*%5so$;k^5H~VbZ`<c&^D#dHuhXGS>$<AmtDajzf6z*JQTYHNSnn*~
zn|w0Y&Kt<YGU>q1YOR}ghnIQEakGWz^!^4Xc3vk>A?+h`<q$9Lw{~)FOn=(Js7?99
z)*O>E<7b9d3&Z}yz0+>-8v*PxzRUl;nG)EsX5O?8<Ht<aez2J|`n8kGCthH-M)iSx
z%-^3`$=cn29WSUkpWFMdqV@NZ5K4{&4FWZii^X<K`JUg^R$8B@$^_z`h+*ESe7_&g
zmCAn4@|eUf@pq+#CxE+|)`0gxv4BU|G1{c_UAdt^d3V(C>kC}cIQe7pl^M77MkWj|
zt#lLVIY1TDn)aaRV69OnNVIJwT8gV1o@M-#*~MJs=mCfy*O6iWeD8jHnouRKLhU+B
zd-v}Bbqo2qZjNmtQm6Z~6MK|W1F`I&Zf4hF#}Fq;&Mw^-;cD4Pw56oq1vcqtYQ39(
zKFP1Lbb)!8z$@sclaWRJ!g{st8{S8&Rkq8WeU)pk3${K&<d|u_BiLOVi29_8(;m3A
zK{)qCncbfWu>tX;c<oPt|3t6$Q^LuVb6=pKB;{?HuT(7-+;AX2TZ16dBA!X+qk*OC
z`)T$V=j3#Lw%mJb`siLBeoAq*M7bBPc*ht1iR7|2|6B`o{D`!5SA>$%H%CDlWXIJP
zFC9y5erRg*#}x@gZjgp^7k?Q~&<^Nz1RFX1z&e4_%&q}U$W;$(Rud-c7n@bz%udZD
z{-Dy2U80~5BOR67zv8U?d44yB_FKoYq*0qN9TZ{|N;tPzP7G8VmX14@SY+HQSCkdw
zO0#4x`B32+Ydt>B*^}_>Y(fhYga+V@Cl6hxzJz#@Gr#J6O4<Hl*U|?i4jkuNzm_LH
z|8-CA_EU2AY5)}Ll`tH!qKo<Ep**JlpvXr~H@(Yx0Eb=PUhB^{DNSd2^VE~6ow&%g
z_+9_!`a~@yCDdu-LzWYJh3B^BAHS%(ULPQ6wA9{O-k(SONP1?o2MUxnAG>$MAn=Q1
z61dS6;1S3{Ye&?J-ifo#Z9Qt)WTK-J%Q#>J3Vf@kkAXaCPag}4o1UnW>6tn-Nwu6t
zJ#0RuiE=h1eZ5}4Kv~8^eIEoZ*K-xa_m96MQpFOF`1R40=Z<S?Pc!mP^3<QzT3wdh
zBl~^NO0yW|G^H@{<<^6vwV10v47;1zO(<=-4_h|TzJNLztT_@du=o8+QS4;6D){uB
z9=n<>`y;<?+78&DH)G#8q3`1n6@tbOw+)U`%u62@-i~ej*`_}^QdCpqAyp)BfEl(<
z<v<R3k+FS@^Qf7GvZf|Xx6*j|b%@vh8KP`E3YsCThov@A*lvPPF$owZjuInFT(bW?
zi7b>Us(YzN?J2Zud~+p1#x6Ymw@@f3-@1Lg^f#OCGQn_T$yw{(&07!FzDo}T)unXi
z>{hSScXq1UoNnWO+)mVcWCz@6^^W2Jlr>VF_vFeL6%fY8Kk9psN_(v;;OD#i{EN+O
z<I8#<bDO-3|6qE>Nc{ar8zJ+L7s6xR7IK0+wqe-yR@R>-BXsCG37$=){u>oPRDpP5
z<M(Y_7o`+n9^1SbPm7`>{N?7-kcAhAoWs}nMW3il9mz!8Cpxk3ihNTtvsOVcim7Z2
zW4kMFP#h+PO+JMc_!052x)ZOOK<5{HdmVgzm<1-2m$BQV5-VIVURkc(n&IOc;t$6%
z)vh=xXh<<%QoC0ONk%)Dj~65uFDio(1ny&ly&q1eN%!R@`=r_}sBH)nJOlhn3i=e=
zSoio!RVL4=D@8UenIzuLusiS=#TNCk?L`uCDXOQiE-nm;;P6y{pK<$eW#Fa$KK3de
z=(EI6+PAxs)~?x+HE6{)Wr+Lmg8Pmt5##0^ifp~;T~I-lz($fo0G<@@89LT;{?bIU
zY|+EE5~~R0O%+}IvSkf7nXT(Y{&(=;tL`ZrW;n>v?#*deENk>-A7*#i<iB5O=pR!w
z9_(|!UyDn^w~F+PQABXIbwU|PfH0`5lSD7p(%DoAT^m`tcJ94Vh~*3^Pni2fznzeY
z##w`ig}duU;_O@@Mqqs>0S~n?D)o$*FGB-_Z?PUH!CIZhpQ;OSYn*Kp2p0{a!Vol9
z+!O(B*Rq?=R&KM)Ssm@OCzrvm4J!71ON+3NR_Ht5Lge)ap=}Xhc26Pe{fm@Ayv$P3
zO^>xH$*06h8IDxs(cj?}4*GC`)V@3-2RpE){PjFIuD<4^Q5w@RHg468Juvl`Xb?U;
zRhqe(m!s4mG1~WgDo$A_VUaj^y*KG*z`+x5TRK}~lRuxCJ@=U8v(S8GmIc<H%M
zPdm<Z^UAHOxbi1Wi`QjS%aK8Fqnp<fTuy<vbxFvXKJC)X^_wnpko(z}LC9X#lQ7o>
ze&w4gw#!mFAb9UhNVAX?G{7nO@9Ljl*?R5RMhU?QE<J9gsid~V{K}q_?Ms<Tz%>D#
z95}YJt_Qz^q=|N|`VFHgz3sKFSqMx%rw@F(L2U2Eo(J&Quvd{AGgpLK6~&rL5f8B%
zF7&&p^_Q2(oA?q*H95H2q?pQW!bc1`^G0|`&ri~jqu<B0iex>S&BD9H{NypO;bawS
z1bfTS6z<nMv|n%8<R(Sr#U2CCd-a*djhkpyG5uxoDkP_*%KTGV9axen-XGNuTn!^3
zub+QMtI!zGm|0%DWso~ZYcM2|5eENF8kB_43N!g^bV&P!s%glyB=%8-1VoAmY{cGM
z4foEi1>kUqJDW&$>ZAS@BBerC&#n|QuL-aM{HXWRE10?)F4FGla8LxX3?w%vvG=Ru
zchZ%zRwc#6#9<?a@+UdV#vol)u5m)o!xg?fCj>VaE)k6%_TSTn7CeRX1&NP&$G=#r
z_-C;Gjuwnxn!g;rSCXMu4|l$-VVxUl14ERGjDuRH=#H=P;a0}`MrwgMyo|$9gik+)
zcUwyfLxMnFq?r$ZKQt0EQ<7hx9aM%Xh20*q^YI$5?%F(IgPUM6|7;C3&$&X2HUWEO
zdE`e)zQ0SuSjlBiZ{XN_I?111`W7RSJ&DiCa?=eH+XPh#Sw=i-8e$f!yJaXdE^I3K
zfTVw^+$;@>zP``Mk^dZNw06Qt?@J2dyTk0o&K$iekm0z02cvSlbW+I8BKa0Pf;K)_
z+k)%1vZRO1RrfJ8n=<@d<F~!@A~X=_^YsbTzg-i#*M&Xw+_)9cr5kb4*F;#)+Ckw|
z5${Rtei<py@AXvOrl0oJt2|TevL_s1tn&Z5&@<bso?t3N^pijwT0i&@h<De_#4z1x
z`fKWR4o4Zf2wjLS&OrGXay&SKbV9uZ$D^g%_N-pl)M8Eo8`ia#-(SuTNf3U~MKNL6
zytD^xKlz58P-3kY?oKJhRZSgFfZVm$?&nfXCa_<_oIZ1o!7v{kDxjABCx;<HEwEh;
z<O`lPX>u6^<k8$XDVi+NuIdR~%Fx*EP1@d`^~S|;gS`OPJu@dYh#EqsR)W)CW3xJL
zF}(&Mgfk(qh0Eq8adB2tx8>olQBL-SHyk&^LZ82%{z(l4JP^LX{feZ0@@3&K`Y|*7
zr=E0(>Ispc%acqrmyQ4-`>v!&ID$oRPi{ICwKrINWUC}WW$7Oe`%0dp9MvA}n>Cu=
zrMTQ{)Rp>u3ylr??I5m86ra$B;_9Xknv#7wgbuVPSY*Stz~v!@tZq#FctNWa{DHKv
zyQ<9ji;8AX9<t~oQl@s5RKb7%o}CR9nd(U^#h1dC0PqVSRZB_5I=Kt-;(qi7Gzm&N
z+7S~!t@UzqAwUpziuMF=K~;~>wUhIqqv1x$oYWyxUgI#PWC?DKpQw&*kgY#1R|8Q#
z)c=PHsDc$6&X7tr&BK4+a8#w0lC7FGNEoy(rXkPta1;c$-^91SQdvY<;AM94ux-O^
zA{0YIW6`;o3JC~Wpokw2X@I2{BIW&BI^iYSuq><g2md-7kd&^l(b~FPR{QwwA%Ul!
zcJBik1L?coc=%3HH~+(b7cT`j(2Ni}xB?vL%^(VCckJucRHUORdT7HzVgaH0j?hDa
zEAKg+A0YzVB)(Ayt2B84{a}Lg|8p1%k@7EaTY;=)6>gyakE8RBXY2d_cp^sa4{FtF
z&1&tfMku8y+ESwwHA}6cHVIm_YR?+6N2yV2)JTllt9DVbx5gfk<o@#g{d*tR&2`Sb
zxhMDie!iaIHZXy-V`rqc7cl2$sJ@~+{m_XfCz>A6DAp3_@`rTx&1I(OoICT^6@#$8
z>{6OJ;`<3pItk*f01&wmwBG5$2V!ZijMlmD9d84qjc8y9{%&!^$xchxUw-|sX(IuP
zD%b}~2MtGe$Wy!?3Y~JHy;p~Wmc?!KG2V|aaW^-_E1M-D%WP&vxZt{gbHyFR=99$I
z8j}CEy>LMkuFj=GTUa57su)2j#@<l_7#E%!c&nVOENn+M6>U(C<bea}mpnJ{Olggs
zAC57p0PUgG&R7MKX%T6rMlJKoAhZMG0dGS6wCYcO8tU{hlp5)!qxuhoAZ~LifhJQb
z%6&F6<U8G24<#p%F715wj3<17@y~y46eJrBx}?HUTzW9WatNbQH!5ij%y$2fEJ29Q
zl=*Lk@dO>4cs|~@q{EiS9&UqUJSb}veHsvl%T7rOR!rpW0wR`j+?K70w9Nm=i;+?3
z_&?>$$0!G+-vCM;^=0X?bUw(imhS88ly9M2m$CD(uiJ>jxj(7UVGF)8mRg<DOS7LD
zigus+nxTRZEigYx6M^UOZwKykw^hXCBof0&D2sB$Q5-~j#5W(Ix}EW6=rcK9pGLMI
zMFLFZ?_Mq?`IAd&+HWI$SD{3o%w1IUF_a&uinkwOqD(PGhe3%n4B|}gmc%amMOm=<
znsVlYzo4Zg7=-``f`$snO=Y6(E;lbRq;n~V%)QwG#u@j;GF;o8z%-&DCMCb3Z1IXf
zmx8bk^GgK^Z8uFAho*x{x~vC-M$nVs!%Gj4@*^&>Y001lZn}14`oB|sVWixwaO&K2
zaspjDny9!5*+ABR7wme^6QUs~{se|jzeIRGoaR3>)gOb0otPG0yo?SNg1vXKssJht
z2_DIWG1TicfR^s@IAq+USAv=7)})Y|g46gb1PnWV=ucvW)OehRBGdkz{%Ek)-{f>4
z0gDCC@e+b-7c7klFJL2Q(zDvM<Jn2%wZhVlUhIxH>IfdhJIF9FJ6`mMFm|Y1vJ3d^
z#_N6okSu}sc2(JESv+O#l2c~OyjlY8AEy5sdlwCQJk(b3Nj8LwGO+D_fp~v;KT>aC
z_qGbw9TCM%)P|IZVwAMlX^P)k|HVMzH3ATO(Qw*A<chl~cc|fx%H5CR>~#|pxhDRY
zx)FzziEFycXYI!GIJbk&hSD*k!VF5!KXAvzmd9;^5}1I;1mX=!;rsQpY@e+m$&q!q
z<kjWO-ofK59kPf$+dxjhFbqPnjM(iEhMQl_N_EWK2@+YE4YFyy2X$AevkxUXw`@7{
zAh9M|G0KvTi@xnA?i${!So2uL=VUA)ysL-VC?x4MKM={YUTz*@U(YeqC0D{vK+O9x
z3x}@7sq2LLRB7G&#5F$!$fK-Z;AL0pV<^jmSx;4F;eIG<O-z+0OcyV`=k~td$dL{T
z#=KDp62NjnT{_T?SWL19x|vWT0GhI`aKIV;CZb*eV{<F{y2!JRsdceS?juZ9_!8fC
z6Tbd|yJ;S}gwRnjfrvVsF8-_Xda|LL=cJ=ikF_wHAg6M{1JS&M)@$Z_K(#iCJZSpi
zavU^qVWS~`E9A`IZy4JcehEp)I>nRXel7oa5rULcFYCg3yVSfx#)bAml!ojhkt6X#
zTtqpA8rdSZ4pQLDtFF@S`1V&sVlS&>&5>C}&AY5U9`eN?hyXd{zBKWIr2^~>O?oX*
zK0G??f(U=;2?u*3@{C8mMJ)*6_bl7$3Ay+14{YuTyg?)bdvR@OFj{Y-(z9UUw;i%Y
zL%a`YBASqb`0+D}_9OF7`-<kQd(vHj599f=X6J-Xn`vs+03n+LDeN`#%AW4@yU%o}
zjOUm6xTYAxj87^C@Q@?!B}n#*J?G;T%58IG^KXFMpCVZF?~UqHi}^0>&=^^W<t5g+
zm~j3f0MZh4@i?{b7k3`!+n7)Xl|@i(5P|PE(DxXx|7(@*E>f#YjCLH+gZPfK?7y1+
z;M?@;c;4^-ok=vVpNnC)DT#f4P1`?RC4(<3cU4R_`j%iMoORmoMrP5${P-9cSEB_c
z34l!m5%-3s;W2bstZVeQIz^YMe?$Aq$H;00UbSAM*gErR_aSJVke49}WWLVw2L4Xt
zY!OXHfhe?IM@op7;~TGhvi<%Z&&oE~9jvu3P<k^vJ(C_YNr?t^nEbUiLb-7_siae`
zg`_Jq!7&-mHAMi8nO*Dr9dRML;={E;ZS5i~4wT<Lyu@zte)Dmc@UKgyS_?0jI_zx%
zQ)PS@v(;Ee{g^A)Wnk9X`_on$eNcnHbXBg3f@8tqr*wqY1kd+rrzE!{l}+nV5c=!C
z)@yszoe+7}!_kw#C^~rz5f`CPXPhmbgyk*#u_N83Ix`&r$p5RIe4V5*Z&dnU$7G;|
z`=26o$s2HkKU)7)GOIy=oYqV06a>%y6LiY`@Sox8XuW|?jQG99D9F;&<p~BZB5*<#
z%vFH|pUFES{&0)G(q|`Bfvv?(LQ&7Ltd7}q{*R*1oZX3Z>gW6GWwbbuS0rZ>+&sUR
zBz_18?jV7kM_D?d-0meX=gkuN-5KUn1cZYImQX>$Zt>|a!TKYD%O-;3-M+O)kUGQV
z;QA}`P#S1%!?)wm%&9&kkLmK3<YAhAfKvB$m@N*r9b;Z;#FG*1<x4teOM_Zyu)9|C
zTbGQ5JM@BMxr}b+Q@bx$zv@vvo`jq?hgpc+_(&7x_vfBm^Zpf$g2VewxZn9Y%}PQ^
zNQexcE0E%omxCrk2x+-DBP<5+`;3UeRhHh}Ouh=52<41REy(${F*Bz?JvHzfQjL^8
z)>%W!#*99LFYHS%*4NYXK_NdI|3}T-Hm}lXY*|bN#1MZ=cw7%gGX}@NluK)?Mg=f{
z>QsWT<zre}F|3mJN})NOAZ=p=^11a!7f9dB+kd5w^cFMlhw5o;qXmGP0;ogf6z>?B
zuW(z?Jl>BTfK<aGAzw8}#_1M544x`o5pkKjZc4ntqTI%F_tUI&_b^7G^Qp*p&Ww%t
z)))elB#5Jpue#5;OD@}^x(o3k3aUrZ6*xtw-5Y=YPU#{dMBQ7)<`uV03pvbdFhPl3
zLa1o^=aY||mihqQkw^6O;)htY)5VD__U*qx8HsohQZh$b{AUj2j^d#y%B3sr=FL_w
z%aCOi>kZI<bvJ*fs`Cdf@?<0@Le%=VUa8-`eI%AMGDxrS^%ZVbVL_1-Z$fU9vaoU5
zDzjAF5W#O5F!a`tMhlABM%XJ&k?ue@e(&t>e)=oqDEmHH)!gD-97zd$CRkq1wkzV-
zyr9|+Ky@*6d9dt;8l(gHA|3f`>{DC5>l5W#??yB9<50o%4!X;3)9Ni4ibF_nAY4=G
zL4@Ab*w$Mw>&LltM~VsTuf_m4m%wByz?~52>wXuIttjoCAbW4sW2L?kK#+D64ocPt
zcp%lfF0_tThaaG?ZV*i6#<t}d;x%Do?p-h;=w!~@^km@M?_+P`DGlHD`A8;TV!hC^
zPda$@;#eFfkA3?Pwh&)!-4444=#yrq_L!N-sdQ}LR!gJ@nDx|~PJ<=B(A)$hQw`dl
zxU8I8s8XhFL3WDhifN@TxEsZygM0jbg)*rUbup(~e2*zhZl&!k3MD+^gBBw<=@U8P
z@?lfm=NmJcT)HMr4*>xjbx8zgFzk8AG8L8&R%`t{>fd|%Rzv>Wsw|?<_NKKkG%1zj
zPpZ)>aQB_mRoJO5e+K@%9~Q|A3wV}Yehiyh3;b{RmUDFm4DZB0t@#JJw~pk87ThEi
z?ox;20twl;t%>m?remWM%qE-VAS|xb+OV_aW&y`CJ*#!fh#o1-y%>29mS_ZL#;Io!
zl5v_HIQv<mqXUQ34AsR&2muWLWw_EJXy+ttxu2s`iiaMjZbb|VyfHuG5f7R}S3MRf
z&TVRcj#|e{)@QT7&HK}TVX2O6(e_>45!$9Nv0HQ8t7$`O(LdrQuoL=1Yevw;JkMdn
zl}P#HD>d@P7u!g7Kw||4V2+^45RUfMwhP+4ri+Yiq!+$uz3)LJK@cygvkiy<Z(XN`
z6lgKTi9JV(9iMn(IZQ@da6MRvZ<VHqe<@4@_BQUn`(x<L(7om>XZ+_7Yzk5vh9kNa
z3y_@#_%J_g%_>hi8g1h#YryqSQWe_D+i5>$5o50__{}1978xFK&eO|gYd8+_;nziO
z8f91nh(7Y{y*b3(8*)kbxt`$?m+-#O$@8G;i4=6wCTVgTW7SHC%rS{F<7=^r`c#R|
zk^Uup=VFeQU$-qUw7;N<{9^1&W8kbRdU^D-G}60M>i0+FNYPHs5#EUAo!b|S>|Ep7
zNe>GfK3kC?uMus-;768i`k8H31ksKrCc6+ubn@m&rD_VnIXbRu@@Lh<3XI<|#>H56
zhHX^g$zI?U<$8~~(L8=MYxSoUL8uv^B`F`koMGQyb?OWzY|IiTR<<KJh8Zh&VwX#-
z^gcA8{y2q@t8bg#!`!3}u3KmIh=I!L{APnUwUFUCR>x@Q57bj(9|+_nY7U%cr0HRb
zieB?}WlQPtcZId-uRPD{iCvJfuUHf(*OgGdlCKL>H<O1^q_sChNKP8=yet+KB#pzx
z2ujdbc~aVeio4pCct>`iXb=nhg-sT7X%4wPg6ST)=_E0YpTs2cg)iy8*Mrx&7~7r`
zT#Td*w;3sm*?OJhfCgQZZcRRtZWwf5gi+QkSSgKn<ps`<jHL-iE#h@%-_x%irrsH~
z)dwfDp)|A2&@+r#*H&vC?|gP_1aS<rNJT`_Tz=<0_`GdRo^G~va2&wV|DF~<gref%
zetJf)aQooamE=|S>!kVK)_IBY%5^6yvkii*v&Q1JTd{s9J6;gDfIH+5@UCz>fg7sR
zd7Roe!yJ77Os~fs_)4xF=wdTUhQ?^3rlS}5CQg%ctw_?t9yuR58wD(iWL0i^fV@fR
zXUn%-I{Kty?IoU^T0R8$iDMK^@crZ34yz#KqYz}vbe<HMOotCh8$H6I`Zn=hN<&Ft
zl_WILJVb6r=Kk!%GHdh&H3uh1t*W;eyhy?g@e3oHe<q|sf%41mWct_AG4=RH*dt;p
zIH;LkWFYB>w@t3l?WQoFVp?iAv1`tgzL}3?&aMb0_#b3XgS{&FQA8mo&)PoeEa%(#
z8tS4@zw$VkD2JdYA|71CL<z8^lhy{PU#tj5MFc@0M45<dy{i$(DeR5^pu^NjJmH$?
z6ku7HYFG%2Juwz5Au2-X7b)CRLi;7Vg_rwkM~=O)7~;_=P%&(9y4r4dT-oV8>))jA
zNkAn^WIWM4KAtc#bcqQjra6O&W{1aDJ4X$_<QDAxsuK~y#7#Gv^=RmwL5-9<$__r!
zI3gZ{Rf*(Mg^~xirUUe0pGZM(85QmTp~Rht<0-R)d_+*Bc~d{#ZsHST^$-YI=0@7a
z9XII&;fmhKeEolJ$<2njnYOC-ci(&ZmeOoYw?XsJjvp2Pnb{{n3&0i>Ux-5~eSGz9
zSykSHJ(J9q_^g%xjTWE+q-v>#>m5$uxp~k?xAcrjSO#yK=9NeNkb4YYlGdIfml?4i
z-jMjehAaHnsyX#zHPLY6zU}H*$Sy2}e4BKxknrjwViLu-Ou2&~d&oWg?!WEBG7n5b
z8urDqFfnsJ@!zb1<o*pakV}<ZocsApBJWZYR)$j9@qTa6&#Qfd2Hu|ZY?$T{M6^$k
z|N9;B`3e{96Y61-{etTP0UEoj;k_D$sXI_8QMp$?`Ws*)>I~=>Uz36w4@Px$PdxeY
zYsASS_Pmpry`qVd230%$c;Uw4je}|Jz2nGqO7V*&w@H5ymXg$pBvl3zxq~o^<StKx
z{_irX?g_&VPly-`E6RKMPVL~$FM2C(()qgD+-aVt(3S4dhz9N#tg0iwi@6h05sJjC
z@75)mKaGedp3B#LOrq#V!n0d4U?PSBso93hY{MJg&@Y~3PoK^HzmtRdkkrh9s>G1q
zGj{Z;hrO6UJ`P8s4Ea~9gXU$Mz2Z62?7eZ7#ZT;Mei3NRL}Pp2k91wvG1_WZA<JXb
zVA%<yCq`)VTjzbALc?eF7(6MO^*LndKnhziIS<^4A{OC6qN;?UR>Zr=a_#sY#RX74
zK{UE59&&WrC10`5I=4SjHmEKI1ln1(VFss5iE+m(kFUH>p~TL;<N&t^N9jy#cL0OX
zJdzrC21-RItkv#*?D|IjyvV;pr;nG{A2?WIabird6hz0~Bx1G)s@+A#2RB!>-nVuo
z9R^*5ZP_o2B)BC^*7G=t%r@?Mol1}V1!(<;|C4Y$SP8#sRyDHSh_s+s9t*P$(?+@V
zQr|&9iCu;{SEz{HYo4cA?&p8sitvt-811(}*UT<gQy@f<M+ZRphfDfXu<o>;!2O*N
zTbn$ROKP%w!5$_0+kC}ohC^dSl)@Bse_8B=mBD*^OIbes$R*)BIEy!sh;gh6Gb^1f
zY|z}2C<%SeyevTUwuFNkH6hC#4l&z*Hn!6laDEdW6HeSyR9u5k|5cu<+R5BDfO7#}
z?2{Oi;<Nnkxc^CctchHc+nu?vUQTeEKCJLk0X=&}Oq6FR5NClXj7bX#`I=^i3I<<U
z+mbYnk+s5QGB|z?o{AH%tXY(_ZAeOy1vyF?ydEyp`TZVVoY;{KJn=l%(zLVx6czLW
z<a>O==-+j7<|`YVT2X#_Wx0*`sU!*I;q=?;+N8NgHeZ(3%qhERU{+w{`1N;Z(SNz_
zO5$deJEl8+IGFy0Ul#9_qt<jh*L&^NF)8pIXfd>vTIBKtC8qh+W2`mNX=51n$RzPE
zHUsC{`zV^gnORDBfO)6xUOid?tD`{+U)Rz;NF=Iv27=!jS>->lKmrw=Gf%JGiShkq
zs1R=H%6C}3C)>5brvc88w?0rS@K{*uhiv%s2kMgFPp)&5LBk~UZ$dG@iQ(ZB&AHi%
zn<J!T<S#Fe8U2SZ2lm|}$kM{>C+PJ!nl*Pqv`)+QShb8CU!R(NKF<L;fq0}pHi~!~
z^(ypL+_^XiAag51I|Gcrr3*=XTKGgANmM{|kBCqTp8f`2CH)bHF^WO1=QnWEpJb6J
zvUHXNK_1SMfg8R_Sr%QNar!@e#h#MV;Vlx|f>#pO=eV(cRpx`vZI-9JJ#!M$%P}ey
z5=!o`xp$;_EFMA!ux*_P`M&z10ww<+jU^uDbZY!#QhcSLgpY0yDtrX{I5T<Vk<8qt
z3K?S;j?RfM20Hp-M!jf7ezp{i{ZD~*ENnrlML6+^zm)hLjgS`1#2QSK_-lj60+KoU
zk5`R;ch{gh`WNtv6olgU<nSjU5`hvVu7itt1Aw&~`VVJcok+Ib42T*-85nJlde{53
zd&#<ei!<O^;f2J9zAsrVJb}_eLB8PU@~dvQZ}NZqQ&C+#`2E!gU!PLjLD@g&8^^Hh
zffU-e>Q$GX*>%^dA=od3oE6v&M0ibI&rP!o?K~!>!$a2zaH|2uC0+N0w?XawVF9+<
zf`U?{2Cj`dSRM0&L^qzW`#T%D?)C^%wrVJ`Y`VRV@S<DI(a%>d#7*H0;mm^oWSM-8
z7{f8n-M+<VGVxG@j21-{-pLt}sbsgX`r3XZf0r<wIv-8F&9xLtZ6<(ClZ$#sIT!DN
zQ}iJxhj=9nq?qr%>V5Ag9MQ@dkGC2d!nYWrVR;1JDuLshm*Lw1i}}DUPuAw{eI)31
zK_m?=`Gwg`<qnz`FQ4f)Sj8?Tq#ddJ>1v7DKkJ{yI6*PqRd)bqUYc^QC{CCI2ZNhk
zRuXJ$LX`<$yK%qi#;j&eJ)VO#z>mussIafyVg}f27K7Jt3y$XjvC`H~`}eUuiQY>J
zO&A<PC~)!haiR9F$8(M20&q<Im-g9wfKFwO*3po{_hsSR2<4@^Qh7qvQvya}+PJRh
ztG<vJ=Sv)Fh{{0l3YO7%oy?W2Q*Pf0&pnztsb2W6#UmLJCpB5XF<z?kzbApmtI)3A
z;75DY_1S(#@rTpSNgCWOD&E){BUrT=>j=$8jl;|Z?P0Qn6rsTkZ*(58WDpba`G<gK
z>@^ai?01D69md?Uvktiz1W1a<J@C3Rw?Ec8M%UM6MVBzjfLzc!^#1_2W$sCOZ{zFq
z&m0JWa?Z<-N@0ZTCrj>T`>MP&pVs|4`A+m9Y`O@)yRiFDR3rcW&iR!eS5^Q$3@>?1
zNb4{!86Kh#4cIo&K8M8cQ5&MkLqXhN`Rxjy)%sgxN=8PTC%!iyIiTJz2Q6zaa~m-)
zsayOBE#<@^*Pg8t!JtcqOY&yj)6rEXX_vS3a&Jug@+?sL#GS2CZjM${npS@bek60_
zw9zP<Ir0v02gN^$scB(RvrojSU1vD(FQE+v;XuJ7heFxS68x_<&E`bk+hILGXfPDy
zm%01+;A&BD?(9!eZOvU(a5eD@pNAOZ3so1QflQ@W*Oc!dA;HwazveM<xklpt*QUBn
z=b2K*B>m1I57*l?S9ZqZN{e~xJ2%eiu8D71Hut%~%?{hb>f@aoA@j?+AYazYu;a;k
zBdZos)h$BE`U&x)NzmYgOzgp2^OkbQd4SEZk{<!4e`Q{BCcHyx!lD}?Cz1?28U!>V
zdCZ5AQz8L~nV@0V4yFmEe=@IUo9!c9L<l~l)+WKDpAu&FXX1v9^H;q_70%bkuV5Vh
zq{{Z+)adMWXvmSaHcgQ%VO`n8)sn?BIb<xg3@nwhF%_Q3#a=sg2_y0c9Ek7tS~8Zm
zE85BNesD|$GS>IX&vDk&xBRBI0gSqtEcorfSoIO|*TH}Iu9`J+JX8XQs3Qz?SzgKU
zC5GHu*1fz!tvf~5R@8?d^Xu&bq0RL|WD*pvwO96kBGXLQ)jKVpBsiQOX0MqPi;V#9
zu^q=hZBmRiZZ}Z5dfzlMhR-=}+IMO-HXYErHO^IoEj9%TDO|zwJFTO=-7`P1!=Wel
zsK48twDfL<1+*e=$w{*ACtBELqi#{?+%BLv5hy8pq;*1M9Pw^)&aWVi+YVF4uU_YA
zH(NaOKov>euNo7LA;{Ib=4x;$-I$S+z$C&f)?vg2(O1W}Om&DL&)1~=%>Sk`3F8lf
zKfXW@KF@fUAUThsx@ua?hfxzJR1EP{h3pwMN^l1gMF<cKVXbgqH6Qwy`{hoVd)**;
z?V0c7OH-qW_^!xjW1c|MtI+uw<$4>|nQi#LC^}<*E25b3H?n85iFVi8HYzt0oVkX5
zVb8XQk!7hS3u*avM?i^F{xM7xDr&>t{E#sAr};EFHiJF82v@BLL>RkHh9G`yy?@eN
zeB)bPmy8g$Y3@?OqdFyhyimmVu5c3W)6B_JhnJ1lQuh|x7%WaRqM`Q<0Fsix&r99K
zR3GCUmyOG%v`?)m8Y__V-wgIqk3+wl)-b%t5IOhweUt&F)GwV4drOPWaDDN2Zn0IY
z@h&jS4;6+7czL9?BC^}X{SDYl;ii^gr|5zEg6+}f_z7QV`K}<Wg)sBJRIJqp`0u`2
z@0U|4j<c<G)sHx0R?tkk4ua857<@`4pDzGfvRP`?2zL+}^N*|kWicZ6$9|3p-&yra
zz`|P>J>HBs@^{QfhFZ-S2ECjQdCv?>ecak5_$QszlWXpKuLFB-qxl$UX9b^qGffgC
z^8La;?;3Gb_RD2mg#N}i#)HibRkvT6a)p06vq0e>-;bp~(-G5`_~Vx^1xfNb>g?;E
zyv3USQJ}ro&dKI*^zx>O^MXHM`sc@Thmb8q5VY7JGnPqsKM0%65oP|YP{uE1qw76G
zuIWVoUk$s~N1x_RfU~kuPp&|=zWmpQ)6#CMC_75Fp1`$h=)fxXmQ`S%fjCF*<M4Za
z%a_mQC4b9O?U@m3zPl{eCki$3cs|^u>2%Z~_YqWW&PUn3f$US)hwb?emXyL`PyJT2
z1=FJTYWeGRd0abb4;yns5?kj5#nXOg7uAYK5pF|g-xw+TzSxnF#L5o<?Cgoma)dFn
z4p90zN7B`bgXtduxN_altyW|BRJG^bHP2=;5zu`c{-hQh>-%SUT-ozcCtt7lt799o
z?T;^gV>6h49*B+L^=%`R`?{Z>s3qrrDLTuvE2I_)pw-XnnGm@Dj!x}~<?H?YF7ID~
z9tLl@U8Aa|*-)(_S)vxz+&zc9bg!jDe?7lDRa(xn9XdK+npVfN%Uk!*+DUL6+Lz+3
zV)Vdlv+II)Yr~&f3dCiT)M5SUpP>QqC3(S!$4*^^4fkWC)DvGXgLrgeAw|*Ml_8(u
z4duHi#svJxlSOZu=c>gVj5k%^T#G&7-x56^++S+xnMpn=emy=TqAodR2W`Ksevoy?
zkbfQZP?;upKc(E`8B378yjm)0E!{_fen9nZ@}BP@=}}tjGM1XfetJzR9dPJXl~{c}
zq%{mMXFJ5^?HCb7jd827(qzk(>$BDCPHAOgsh$<CBq6r(I<XnqqhOs#m*|x~Mh@-g
z-Z^KVS7A0LQ8w4r$dkG~R<?BMfXcvr<%=3$wTvhH^_0jVrq`4k$X*2dAcj}xP1fju
zkF(K&991>_wdxO1UD5|~gPP`5wbrtD_0Snx%$;DZQ+11}O1h)B?_EJ&y>bT%vx-uE
ze>VzN{f39a-zzHIk=_>Gp@`pCt<AZra){*kp?3HD@m%HR__wH1eew?v9rElH`Ucv(
zV7jAdwq9<VheDoV7P9220gkHAgI+j%h+@e48~-xM)8#vZ<qa7vU%>fcK(a>e2HQyX
z_1A9^&#eBWh~Be#3WtEl8M`yWht6-t=hS|v+s>hZZzJm9O7gAKf?MRS+U8F)-o<w+
ztNbL(M)FHPy@gNSC6ia{3^V(_PCrcXl=FcEq*fE85C2L;S^8^4#BKPWfV&Alq;AIg
z@C1pNQ&&r;d<^dmo^j7mA*aQL7dSb~&-NF*_)l_3F8gH|YgXIi=ErW}VNhy%|JMy<
zU-jo6PFWp`0U0mGA=nrnn<8lz_`QP-zwq$LN+sAQ&kZKvIeIGwiecZ#!66{u<+P}k
zFBaBJ2i?=^Zo|9kABHF<DvMa`Xb+oi-jgiYp3A3@p)s3XvK(KHb`_9f-G;9VM{>p2
zWd4{r;*HOp`fV(cA-#85Z!^=h&QdH$!Z#HALm|E)9<*tPf7Vj)9anVdJ##KT<SiHD
za8x8AWc;^EBW6~alG}-Sv;P4*6*JwZb@g8BHw<cBEpgxNR4CZ9xyMHEUVJ%5VO=NR
z<RaS@yg_=P!R!oM`<rm{-{{vxrMn{sbD0}6kNKc&ecD=Xv6EDL%E|X9`qu@A$wolm
zShQ3);<K}RZ`2ht#;me9z`aSvIYUkdtLcqNnxn$LX}IZyeb?tvODi3=nk&B@_A0=R
z#`*R5TBt~7xiP79MUsyZ)p$Bigi%dBm8?Qkl*#%}Agk<~e2uc;-_M`&P8U%s`n(#g
zAAPX-`#wo3)BP)+cW;gEx!y4CBX>)mk*;T@I+9w5nu^r%69A;5C}G0&QZF6cjR!T=
zK*PQog!X>|tpzN+0%wlDfK;rK-LnYXdvkI6`wOJwbsfI0Dz!L-a##-uHnqDj9PGZu
zHZt$&GELB<4rrj>?N9EOm+Pk`q%D#Nbbln_bMK6E-0@{gx)0;0Ag#MdW_T2+Y(6zA
zVO+SRB#OIlnI!Fv>b+bG;C41(_xMzHYSc|!aEw|UFJ@D4v|J9KE7dBGP1~jtweK=1
zi09;=96!A!`<IESE~yNDnRQ=8Xddjd4FBSZopDw^nw2haLt2>s2oSftZ>D1h)g4ff
zHJeH+T_ohI41ddFIBIH`&Yb;T@QN>GF3y!{TPSjCjq#P^20lkPPHHL88fFk7dwzQn
zE*xNzDQHaYoMzrt{k_B>{Ci&t<r{(Cg;VyQs{jr7j-<Ni2o0Y*)sVBBOGMSouY&gm
zogI&ZMMb+G$br18R5oQ$FI?MmVk=(h=ln4LlXxR&R65p3Vx`hTq;l)_m%Et1jZHtQ
z;*!`YDb3~Fci$Y1*HC9^gB;q(qR-46MC+FnL;jn!by&HHxL27wl<)ryrSg8GW_@7i
z{)1Z=_dH77UP|N>20(9Kxy`~bCvO|R=49>Y4$LRo<fh@Ml^v^s_p5YmgUlFi?aCo_
zSa?ld%yqx)s#-^eT}vo3l?SpLtpIbN1`+QUPwWZ0(kg|IuuAEOf0_%WOX#~rCBgn8
zT>*pT-`_e}#mwQSog|X1s;@;R{LdImr%YCbA2~nDwRM*Sq-|Kbk;cyJJwG)VwK<bi
zY#JYi8o#zY;<w|nSr?2Zk&s11Ri^mg+-a)HvF)whR~B<r`wgdl=@K=0$@-qOLcCCb
z?A(NMOxA=cOkvf7{RApIStftY!V*apJ>W`LJm1#H6>=)$>JZHS^g~0#&$$ITNc#Ls
zVo=FoXu)O)&Cfoij)4|3A7($FeKCQ?tqzss5|b*{(JD=~o&GQ0)TzPFHrd0yW~A*_
zdlg@&>~%ahpzMjolE6l5><frdkCvX>KQ&YH%(S_mD4(yq7M;s;^(K{1vIh_h(q*}9
zYOhZEYG+Zi$=hhBjN1P5L0?fIi!$LqE$Bx|n&z6%{e_N^zISEUz2Dun#SMnZ@O{}N
ze@;jLLL~mX+#Ql=+<SpIJ?fSiN(J?*<0L70DZ~}&fA>?Vju9JPLR0R>Y=MEJw)X4>
z9tY*nuNTH|zQj+2metmjOH$UI>Ul^pP9F|10{<97Qp0~NbU2{vDdnaxC)>9_BkQC<
z@2L6M-hNjT%Q&@YrLdzFR5ZOc`18C>@~f=N^D`l-a;^A9Xiv<Fs`_W~@3MJyB5NXv
zm}hu~s^ars?sJ39;^khv_nD4Ns^X+f-Fvl4MP((Q8?(a~<+}cUo$6}R6b#hN${2c(
zU%$g+xc-Ru+v%Gj7VDlxZvB0qlh%6wPxnrmAK)VRYmR@l3UGW$zALWSu=e9eo2ld}
zi)3I)uv53PNlNP5ljk$b-=lroWM=Ag9~8y4kIvKk$JFH`m)^X&i{A*;L7%sMc^V`x
z+&neK<9BnTp&~^#THOuD^*!|8{Q$9ssA)lY4`IfFE>nj`)FDNt>14FeMh#jZX4171
zLXrzC#<vGvd`YHwmN}3-q0~kb5WW6iV-T&{>0fcX&lL)VHkLC+#-$${_j8w$1^Hi;
zJ$d12Wryy0W3Mwgj4IBlWnnOU6zin?*YCco9DzMzm8eRw{-#SoD`evF75+8JB0@!^
zsW#`oqMw4h^NGKh=XlRmbk|5|PFxzxdbfJ05;}miBaa)YX$Qf!N-{DfelvG(vt6ft
zJQA<(F=oVAyU;t4Gm{NB(|ijv<2+i|<GXK7c11ZE=rctdxkPzAd+P=lcrH=3Gb-+b
zETXuM{`q`z->H1@3&(~pa}S-rH|Dz(J4r<>xbU8A|CRhGhnCkGp^IV{tTZu?Fuil-
z!EZ@1^FHwl+(~O$LEX~4A8OBF{@T`Yy?8S(Pwvp^@z*v=je^8j8%Aq#lNQ^AVGzG0
z9AP1|MPfy@YrN|9Xy#uO)$8?3`j2l5-8>jt^U_4W$*fe^z=fFqTNdbr&Y4c}uYXQ_
zTxqz*nQsH~g+AeF_&MLO$Zusvrhz!5J9r(Id3}$ed0U7DCUici`jqCNlv!NfpMl;{
z_kQcD-QAsog(NWRtBVHQid{7_QJr(nATZ~1X6*O5HG_`~rczSx^+v*3&r@pe@8`&g
z$2?}_wlsXA#}u2yWMOPASL^94#c^B5_=)LL?5HB6XYmNzv(=kj8Ff!f-o<+i1B2_@
zEh-5z($d{AhEui@;eVT?WoW+HdRRFm@<-O$+IR0nrfd0@b!wzenW@U5jw6-92czGo
zo|adiC8&j-f77aZ`RNkF+}Op<{#l`*!b6ulLbasUYaXxa;mH1`iDP@~zUk<TqdS=e
zm{>x`Z-hCg+^PpXrL!pCisdQ78>%<x<>JPk*Twlx&h|5OYZKH+OQKv<QS<bSHwny<
z_#sKOL&|q>@e13W9NR_Uv2D1!IBx#wyq|NqPD17sDV0LCSN87~sq=s=9-<f4*<*su
zFb{BOgI)EL?*|-ospL+qS1jM{$9vvvT@DczVzCdrQJH`%3g`QUU15!^Gs<01x%FA5
zp&8Z{V50#H^~dQjZ!t<uK&4P@A2xFT<o>H?TD;&a{Qh+%tw2}Sz?2)}lmmY8^m9qm
zuby-AA6&Gq&GU(q@iu19UM@)qw0}Ys++3c@^LxbFEmbch{HUZiPBqJ#?Z@(_UU4yf
zNHxI5Mx*%cDwW%Zp%wUCI5zGsze}x_(8BYb&HYr_P4LtH6W*ZFq5~(_5_;?3lHR6l
zkC?T?b4iK{jxdYLkunCF10kXcqg#)UrEdqmAjjt+o!9?Rq$0h+i6DK7u&;A(-vdwi
z5hM^EF@FQIleRBDb^2Slo0oU5goSBH)Dl>2z!eRi_7>a&0Nkro_3{o6+c+rI5>v1}
zYm1d@D|28MFf^j|SDY%~y#39IO(@I0m&xu^W2rdw*~!&t%UBe<)t~qeA32TewKX6R
zX3(qsHs&H*>{^s`bl;sZ%CoWt|7Y>G%SpSc8l&|}X%L?$Auj$|5)8>`@8M^+`cM))
zUUKepsOl?=tgL~-^qVxOB*%jO+d1D;3>I!5WEg+#FoHtYdrmc4qZt}8QMudF?sJk7
z_h>$khOtw#1Sh8_-_vkGg1kl4u2G6xeKDWJo+N)t`osB#*kjG)+xYcqQg7l{=#XPP
zRs1#7hC;R~xI8DU>75qUG7cJDw#@J7uEhdlC09!%;q>x!bX>Awz7HxN*3K!Dzm=4y
zhHNNHvK$eST%e!}&RlL`m&v;B;`Bq4vBznQgztDup(&Zo%rkpB^sK^quQ*8{1?HgT
z#&&&Kx6Etk`rScD2D*NGX7RD{B0YyWX#hWVOp)(6(+>BQt<I+9YVe<#5*@>E(ullk
zYX40o{`7N&5?qe#r8o%jBwnadHiYA@#n6~Gox;ezpvI3P=G5wm(*p(naWUvaMFi+h
zDT`q64;;c1@5D*6*B_w}FD?>ZvjOlqomBlXZyS^_R(jAty%Zm|(ogPM&2;tBvF^Ed
z#Aycvu^H$HL&V~HKM;%&lFz!fE<%8=laT7RB+m@`B+ujxi)D@GToF-)KW`9XevW_M
zX5NZtYvcJ1q6r~}W(T4EXzKnRVic%AZy3LaL$F;wV+p!6zh3A;8Q#(CPlTt@!6{fw
z*G;D3D`5~5Q8gvo$0DaYAEb;)6BC}p!}PaR61v3(++M>}Wx-$X9m76pSq7UF*2o+v
z)$7<oegk0tAr4lGca>@`+pL#<ngn_9Y1FMf$$A+bGPv~8p+BeBcwaF94=tv;txM{d
z4k+e#X%cAwN&lOlx*tAI%j-J%K|-Uje0p0ID;jxUR)}I#<&b5LA|~eeG|3=Gif3sl
z=Ow`#`qLtXZ-#y(I;Bsh!8lDy{u8;`+SY~Q`n7U3xi0CiaPNQCiL+nV?K}PKD;o7_
zrI#Al;l$Yxqb+|vmgN@Y$ghAeTLn}pNE2=t|Gl(yaM@Q_m6@-@@ho_%wIh?_P55RM
zg|4Q_Z!7c6K=b!zV@}w@i#qiAs%hs`DSFPook}=E)Gw~N4&^Kj&5wT$Up(QB8@9D~
zt#dXf9A+L{3<2C%1<D=^i#B6oENwQP)~5(2#`w&S9wHX^B-Z)c<vGh=megLWLccI1
zt(c@eOL9^h(kR%VJmmJLY%6_PE>Z=(8x+zr^wEzCeh$xQt3K#C{v#?40;`d?aUXu`
z)foa&nmpg@o0Pj3P!DM<X#llh&Of(Xn$<wdJ>QQ;e+?>xy}7{WnTbaS`|;>{jj9h~
zWN8oC{OQ|PLw<yW_o5CpJ&s4dGGEdXi{zc`-*g`|uI(R1r7T-}79L$%T!fkWI#hV}
zE!Z(lAe}C<N!rAY$lf0g-}2=6l~{W@+s5`i9_8Cy!jl!7JWQko4a!5`4&R!DsS$W2
z5z4i;W%oD*Je&gQ+NKniyDvK|YBb-kX+A}Gi=btR{c3u2qAfK(H4SQA*Ar#$==@(#
zb6{8ah#9jc1}zgAD5VSp3TR4&79`x-xjBOHWFfXAnc~nz;CIOT(jW{I-bx9t@ozT>
zYoC=n_2xPWY(<BQACyZ+)64Wf=w9ka7(ZF!{tu@50DyWv#)2bWl`U=nA8^?Zk$Hxz
zTID^yD>*i2FOT`I(63FZWHy|CJ)&3{>(D+7Vp^(^SZvML4gVX2C|#iNEk_EaUqy79
zAeP_vEL`<sp>iM(fAgxGW#T$fHMeII!npj3SpSZ(9|%d?Er~;1gk_+pZ47D36*dUM
z1Gj#W-<E`UYP~%%q4LxsZ(D6DO;;f;xyZSgBYmMXJ`vgGWyE$nw#?*HcqjTM@oP(v
z%Y5|V-#geOlJ*>9cm^rzgze|8e=xHh4g}f4R@#W+-E^z4GZw(xhQgm*8KSBHK#3(}
zCnEImJ?C@lG2hSY8aO?jng_9%NF7K%D4-B`6F0Jjx?ctZ?SsbxtgFM>$U<Y<=;2K=
z?^_LkB>gPtcSb`7zGz>p@Zn3|;3pOSPRBJ`Ei$93n2EoY8_2PhbMftrAVYYD7&aBy
zLXGkR)>1c~j%upHojf84Z65t&)L2cv$;BR28~+s2cTW_;U-2M;gEsAB(gm+RM>GK*
zR`W)K>Z{E{X!s4zW3TN}9FkYilm$aG5_nGA-b*Y5%iA35+15H)wZ&GARYqCKUBdp$
z-q7mKGy89X$Aebn?G+Itt!6@&f<S^&I-2Ung*gAM9FB1#7wldn4pt2De!ODJS^Y+}
zs1o<3`aC}jAwB?0P@^yrP%VNSy<EddOSs<Hx)bDEuKui9N(d7sgfb**Jd`lE%Kf=L
zI^)wS!-S=CLdbDcICA9+Y6hyGmaz}k+=*6*jfLes0u|A#AVSUA7!<j+=NO<XUO)Ux
zF@wN%DSkL~>Ed*~?b8(FS$p$?i_`xy6cJ%gp_Nw7Ob=NO%sMV|BVR695OV~fD4s7A
zJeH-3*%E6dnq*_0et{%78r<%w=@WOqgG3z{3${d=?Ey6LybMv&Td;woU}|L}gvEq{
zq+$py4ouuvn7}%Gz3XWkJH9hw=R|DDHY6TZoc0PVbUIugmz?=mc;q6K4o8lY>Ug=h
zjq&V?j-07r)&DWHyo;m5aa?M!x_q!cID+Gbd+OMxzlG=G*)K<GKGHt<_Z0Ti<P$4n
z<e_{&Fh&dtjU~q?Be+#;WMSp+Y+3y&mB~ZN2wqn@inr|_K}=Rt*afZt3ZmwIECpLh
zX;`1$_Ds?Pr6iH}oPaPK1llSRT`ryzjE(`xLyp5HUZNT*EWI4(Y#AtW7f^@6T>6oR
zPD_y@I<d?#h-WaE0ABwpYwN>({nUqx0sd-NR|6Z+8&_5On!7K_9|oj)FLQmwm@faA
zL@+9|DirG^SI*>-Dv;zskRT*z4EHc6ylB(ZO)&CS+|-FbRom)`csQ6NVn1qi;>RTK
zpl1Abkg@<))os!FK`mrLMK9N6T^(do(6`o`TT1pT-$~>7^H}sJnTCWg7z7Hb$q94e
z-tBG5y6bv<qm6_>9tsX6A-EI&*dCTyYAz@k0*Sn;Axdce2mfGjSK(~qYnn+h`88V_
zrTgN<`4~g#mBBPYh*<3yvh4%uvFn_PYtuEW{blO)qbIau+Mkr&{0UGL=JWj>s$I|7
zNRpEDt_N!bybk1a!lUO_CbGQV(uHt$r&XDZKuu(CTHlrJz~$^`1ihRZVcMDiG|YMj
zo|=NgyquPnM9VfO)KK0=g2eH<Lj_!um^o+CvwkiqUu@d(<T`9{s|0&*QSj8t3WEF{
z*zxv|hx0#4jf1l#6u7+&O1xB|w?I<s9}riUf+l7Qqhdt)2^57CUqNb-_zOJlxR5SH
z^MvMpNuOUmInJ1l_3cs)dfEvMlXr(x8NABcI(gtMWwR{0r}PovD-8)!<rgvt!7#u=
z3kfE)SXsMABaF=Qn7A>35=V+7Z}Y@{^09Hc`>^HDXK%-N0y#nd3K0%<<M^+z0para
zm>ZzRDOMnf9rO$j`P;NzrK$KoY@v7m>&^_*sQfD+d0~(#kI$XB?pWyv``qLvrDiY;
zf6+0ELZduT7*y<lUT1_%)90WPGVwc4d_Yhi@^ZIQ2iDgai}3$^iK%vU#MYlYALxB0
zr_!7kkEjC~CwaOuI{EQgBm7UhC*l14Z~_T#<BrI%!`ZLawOd{yFcRdm!0c0c(X)9;
zh)h1_BxVA32cM}BS2-1Q7}AA>m4HkMwRxtE)nnh^9Oj-9&>Y*eg_`ac-07P^+)~=a
zv7{hAXjO%wVo2Tjj2S<+@vYg|ZSif0te2zi1L_bUhlb}DptbThh4oG_f$Xw<MO7)X
z!FKy8kle<j0#lxbkC`1j9kEkC)7ZyCQ699Ip$+-^!lcu_-+<{Z#xt2Pm?tj65Ip+U
z5<(4t@?^yv=ufvCQDnJg!7%?z(?LTf7*z_8bMs6e7J>V)g?PmaJr4Z=1GigC`d-oc
z&r9S_pjrO*r|K!Z-an-xM&ex<V~3i33YZ5X=(A>pN|E|k>{;~LM&1T<p6^i;YZ#Qr
zDDZ@2-_6y<2>PSnq7|6;V0urYd-7A;XHiwE`;zrg=uqjf_+3%UiCX*12b`WLJqbkC
z^=U@|JP>xtFX=-z4M2!VfHZAcx<~_Yx{<f4j-Qs^E>j{`&DXR%-ArG9mCirqw%Ene
zFUu3%OWnqxXS<S7qwJEWTm=|<;+7v2H65WCj(zD}g&1iHWD-FuL8Awz)5#7#q@~@)
zpwKC&u`r2AE;&rp%$j&1W=IFiT(i9bZaRe5jiFI?g}}`#;nfJ`2lw9h((D<zc;v&V
z;LPuanw;Vg5EvK<fsxeQ+MdowO7)$^y^oMbF?}6pLXP~n1Kn-e>s#tpQsr6qfPIB*
z{+a1$hf(;eqh$|);A8wtq?cHT_oDTG55mtBGlM&1Muaa3-a~E5M3oFNIGaQM=iBak
z*Y&ByRPuzrUSP7~)Kn{0H;*C`8b&BXxv$|RVh>(Ss@e&)Lzid&6@$56p78G1j;qFo
z5>LvK9~cF?_BSc-$m}Dl#`<E{wJ(KF(XU}~nunCNd{}RxWzntKeQ<W0*KLQxBU0k{
zzTt-j)BEj&lENI)1{#>Y@T4KXR#-DUH>RBY#^D-EJILmLD1R6@fg{7B&3~S%h?nbr
z>$ysutuQPDAdn^ag5IWX6d@yH90pQe!#4Rly^M9c=P?(2>^Om^V?0S0VkR2A^q)}p
znm32+{3sh3%X=}{B!Z}(&wcQ*bP`6>W>Yy0uB9FR?#pp(T`;bgOCY&KB{97Qw%8KE
z3ZzyfFvwVGq4r~0V&OVypUR4aF&30(Z2;Ukq$$sIvsnG%eJ6^FjhL6~-GYl_JFX_7
z%%+RD_3?OX#TcuOeX(OH2#kq*yzu#Y&85l{R_1!v+!YPHZkyMh0D`}sk)6qNdS9wt
zQJ#G;nuq5jg^aAZFA*sOzbRyR#tkSWuNJ3@_a<pu=2?2z570ciW~TnjQ(6rJwSlG1
zQ>)Cy_H)=csd8Bd0WrwM(~4JHra4BTh`Qf44*-prcKf!*M^`%*m>%inzr`1~uPCP{
z?RKrRvGsj4U9rR)JnX^}=K0_wBBTiTBXA5b_;-D5BWDRX#$@)Mas`pOV7;Mxo^)-F
zM!mN%G=r;~OtH!w2S;iHM3Mm2-otc<s@Ts3U9ti<Spzuh9<=n`w|j0<>uU}`vyVeT
z%;YGgy|d@C>xrLRU8FM!!Ib-xc-6-6+Ixw_xLEHy`0K?ZzqS=gr;E+&@{qP2ID~l=
zZYm=`WV}qOV3RQZPvBH&Iuihc(BB2K%rM^DMs0Bf4U?w-tsuFP?ln3<Lk$~4P8Z>V
zEk1oQygXp1a4YR3Ko8q}4XwS#NMWIjYUuic{O+ZL3)eFh#4&*iWfnSh&`8RUk}TRD
zuMEk;-J}tv6Nc53=4+h)!+idMvBk!k5xh&sRTBxnA7P*rRt#zxitnH+v+WLuosU=x
zkkTTOzU}m#CIxT#xH{K1ztJ{5Hb@Kdv;S?)!@Zr+kH<z}wjlr>iztDXU0`a*jvnIw
z@)4m%J*Ke$-?fmxZGMsjWv8jr$fnWLZ)5Lkw8kj0*Hx~ZVDWEk{(}*W&rVhz9ZTo;
zwLQU2seio4SaU+vOwg#s$~G7&sWJ@!{B+X?H1$={bo&j54u}uhwShQFaA#s4Fuh{_
zittE!CaL&_nou?LD6|YYM5|yQv|P4h7=Y}e;oAk<?J!ZFhzF~_7B-unV;CFck)HPd
zgKd?>Q*Nwyx~!{S5m<uEe+Gx5GPQ*CRXYJiE5>_d4^~ei46BiDpUqr~71xf=R+DFA
z2;9_Pyl%%stwR}G^PS@*e+i>ugn^5ugRZYV<S%jzXgLKDV+S3nboClSRox+9V9RIZ
z)TcHS!(^N%4f_szm_vO1B6PYg0XonU!=>oG4s&i9E*eWk<W;D}3LYJLo81@$a$X#d
zRzMMzNR_2J*=`mTlM)_q#wzuOvv5UqkIkY3{=ra4)wHec!nvYOAfR${_sVb*vxoSx
z=lmj0Avf-CiZB{Qh)zGNr*)YSwa+vN%T>t$vrWH>9{5GiCzY_PCSJ(&41p)GprKLs
zNP!QbKai}yPcl?FRfXR^*ReEfHV(BVG<_xj)PVG+H9;I_C@V2Tl{^g%gt?En9$iB*
z0X2?z>utjk5Ir9j8UbV#T#UbTf$EVGU?+g|w&7i98tpvJ>9}Z%0`UXJbm}A*v$2mD
zgvmYA7Z-wI3b%4$52())j{0_>3ERLFmPzShv@q`}`|+BSxwf3Z24VooQ=HQQqjYqe
z(wN4bM<WHNFL|c!7kq!zxh4MNDdp4Yjek>CR+gv4X3ddrYx!RTf?M9pVWoYsCFjAC
z2cOUV!@u70HvKH+l5y`#O>-uJzoSciBz^J{)5S80(O)Y0(`*=&X#lOV_Fg^GdA$XU
z?5A1<+`+S^EME(;q!U-VPj#)^c+xp%${ce6*2$Wb!c_p80>jJ+?-d0hgn){XpTY;@
z>s!ax$9#~WI-KR<>>}uEx#aoDC4KA1VkwHPa6ewFXUqY9nW%)JVr`t^XCU;=M1xN_
z%N8=VY?YA!Ch))Oi@&@e#%<!n-(sNyup($Z-(=CwG4vIAE`jvIt!Gzf-CkZ*OKagd
zDYM3~bmVp>fmsGB%I%l=dI+t!cJkVL$L6^JA?81ak>mSRrR4-Lo_aWG^U3evb<NJ9
zZ3=ofQ3<4kZLptmE+M+6pWX7_qlu=Q15Uqd?$$ft9sAA{XY>%JQHp(`znS7!_h58@
z(GsYF6FK_WS{Hd>{fY9JJ-uD#wwwRTNw7lGTBK;?rS_AgkS{<XES+`@vX_aaIIc&@
zoQ*%ktq7qJMxiX``3jA=DW@Z3AD&)#RJA^b=@@!cyRI$FXcn_^>3uHl&>i~UO43G|
z7D5Lg7ACs!N{%sG^_Pfjs?t*ErxArM62jQRrfDh~h62nDz8S}833rT@SmO*ue+4Tz
z01_K`YTwG1!wdMkQK1=e>=s<kE4Z6;gM!Y7!|Gp;C<zCO*@kwzZZiO7a*=zqJ@NI5
ztFu}Z#DOAD5ZUw{y=`o!IzyI4IRkzJjzxWWuh<aA5NnC)6)J{40`ApPy4L=7vUxP<
z`4ImQ>lDUp!)!)jVs;1Hua(sP^VbFm<4+WM?Er7aL)_DX04R5ok1&47ekFM$%5b2I
zYuWxQ(=CE?{IJCa3!HhkDTmU`Y7`K#;W8_OB-XpKb=XpDTN}XqLnCHuos8is@llEd
ze!QvKorAgHtR|6U{r-dyA1kliGAkyQs|$sv7@QZD6)tO&BGMl){uBr8&<x##GZPO-
zCI>g4!TFgB{Y6?5T2rAv=HU7)#FiF389n52n1Bt_+&aPZG}Jfj?@?4`>=n9zK*o@_
z!tmqi9kf%@<wLAePXeuPi*Z#wtlUY{ic>ReD=p%N7?^z)ITYsZlyQipsQR<HB?F>R
zKB$wYBNNMqHg|9=;t~av*RK5@14IW6+1%~W($l5$UH1R0>dfPzj=nzr9fPsUzHiyH
zlaZY$Bs<v|Ych!}+0Bft2nor)@B0=?F(DKo$`YBelYPlLW1ji_o<E+~^Vj#U^T++(
zd(S!d-t#`6pyYq{YY#KiS|2N>FpaK%05o?d1@K(U7Dvia{Nz0XyxYb;3QYg3Q8FkQ
zq*0OHE)S8Ye@na=%eM2T{^?5uSnSa%_G)(@=Y)dPAT>iNgN58SBn@CClmuGd@Qxs_
zB{kjb=uc=px=)L!V4&j1Vgve}v1pA}OFhifvLKg$k<Vb$iTzS!4L2^(G18r7KPQQu
zGs4k6u=I`W7DJKaO_5}1JvsgeZAAoN(j7SyIGP@6jRtT|ZemEffd(^xP2Vp|<dn^{
zIo(r*V2y+mn|5=8s_tAk??dDAt9^(YUl9xS<FRqgg?n9I0cvFO*k3~UgdYgDvuK}Z
zJSzOfgNJgazu0dk6C}`3zZl6>$a|A>lNkeO{*^L}tCMoRf?{p^kiw^13cGsec#Omv
znTrmBy}X!w_;QF^dL3YkutRgkS*s_?UK9c*mAwAfSYD!Ov4+^Y+vuIA_l&4R!e3uK
z+q@uSnI~1*Zk~7PxTMBnOUx0}He|#<h1^0{a3Fc;EW>v#YB&qW3z}&iDp{q$QYn>@
zM&Y<!#&gWdlG5YdkH-W`)2U=nwsAzvVV4$UH)+{Ham+h}5^rrl5LxwwiT)`xR)6>=
zm5_^OLrcMu8En2Bv>QIY-D7$M#w(#@st_FhYAeOB00ZI3G;xtKY6r4iBHg1Jfz2yD
zrzh(3Jo%Kf;&OKf&)P{{Cl^I2up-iQKjy^UyVhT;XA2#PHH76xPB|7eHB`gQD>}Z7
zl1w8n(dt;0o{~O|10%RvxZKx1?itcuz!^|N)}mF$S9cLjS!<g=9~_X{karWSZ%<Lg
z{@nV4L>dFdJE_c9F`)@|0PhmXcIc5npoYx2K;dl!)#Lrexg^*cD)!-bVg@E%f?y!d
z0c01jRt~N-y0G5Ss5oUiAdvqDl5%*+UvM-(E={{nJgHF)A&rR2B|LugGp<y1D@pZ=
z?M9jfzrA_R_{s=|SdpeLwDiCSV}K5#DN{zE^ErdkYTepuGe}87uBVF<VJqtY;!hy^
z<073`>%2|9Dd|_1KqgL&w7abX#V`ZE0GGa85gj5}qjj+4a~aR`{NF(&n*;E_%5bFk
zPvH5bv5I0}ib7V@NF}q;o5DtlqR&9a2=I6x8cPxqPKgh$id#a=^w3@~vF!UZB7_JS
z!1*v$>m}V!(9TmzweGyD=V&li!~)%<tVGRRxp_7Tegd3QqPIT!*_Dc9+O$W>g__<R
zM?lUvJBMka&5Kl@J>L4ddd9RtPhO{$Qv0~xzEUlZ$-Knm<=PR$J}sG8$M7fP&|}z8
z%HJ0d3A7KEi&(zQT3qS=63rO?Bt|_%`H`;9nIGhgsuTQJ7Q%HgVx7<}Y%&fF9hU<$
z*IDl)f8CG(`y1;gXu!)fi!c@M`6TYwj7%2Miv-=WX>3I9AywPnunhkLNBvRWtMHz{
zh!FTl=ux>e&WvlaZ_lq<tB)IrYoQUfhvK3s^b9q9oUDE%$Yt;+V*!TeGedoNkjQ&5
zqx*%alTUQBhh)ASG~XirSsP-_fhus)MPLcRXh&&{<?6wpQbN}Ht<o`WKb{a^zLeHk
z0H93@jfC14+%G5a@?`lKg5n-g8oGa0roXORXk|xuj)pc3FlUCvvb_%-wBV`YU0sJ@
zq2Gh<39Bl-{hKGgBYw!VLV&~iDjM=ZT<GT6*7UD^-?q>^(pQ{Kv~_7N0+-!nlBS4?
znI_&(je0kchQ|w|SwAO^9$Ku5-U2YdDbWjj2&H*0SuNB*DUN%Kn=fgvHC&552^p$Q
zlTIfzB5N%R5*77%a&Ng<)fgcBfNpR%X@hL^I68%?*9?iRpk>Ar1sc61KA$!QQ1I$U
zq+co95fGqY6m1bTO{!apHNyvhcXRYX<Qj^LxXnNy{wnx$R3Gvp(A%f>$8IEL1X%<z
z;zg(|@qdGwNEPgRztIO+a0{R(U~4G|&1f(2iRcA{{5mlT*%Mc+1UCK`!y88mjAD@z
z^S`7qU=FSRL1ybdOq5%_ZHQJZ+dNX$fGDXACi>ar&&FRp>!Z1q-9>_^@F>EmWRx|V
z?pp<9hIz^c{S;CALxadQ^ir9R=D?n3?!V}W)TPQ$2>D1oPu9?4py4uX=Q?#-+CU8-
zN2K%a`*`|(ZWNn2Q6tqj95LXbAfA~u_<?_Qard_%TO60@lpZxf{OMPM)CIpzZd9o&
zy4%Q;55Y<dEz`$QnKocb#iYD+n|o{NrEXk~cCWCXB<F_I{jNQHE_n1TWYJU?A&U)u
zf2G_Np>%xvdkcT(;P8zcT#pOx?R&cFYQLX%E1fWxk$k9LWYp(KC$}>m=8C?YrWN1D
zMPThG=312V;>=iH8pcCQ38r%HUW3yFc;yLdCU~Bw(H<^rtVW9jp6F@TS2N}F=VZBd
zO@<e_IQl~br1LX{M#K#XFFuQ?T%5BIfo~iLs@GO1*DPtg6-c~DrpJcoUgXaF0UxzJ
z1drkqj;9YqV=)2Ce+}XZdUW5jArb669AJN}%AZpR8Y&EJjX#BUxVU$;Cu>R&r&|xp
z;3K+JJqNlQwBg$G`m0^V9#@(Y{62p$rQm`mRF=zB0V}f4^Bp6?_r!jz-2}KC`Z#sg
zs*Tx_$fOHnMrK%CV=*f*hrj__(%t`5{v&e(AG9WP?V*|bnXfPa?+?6VN5b5jcRrBB
zQJeI}P3MP)D*}HAwp~|ELewipJJ1`TfIv;PlWX5$aFdr%7{(XJh{up%6e4mJSj(Qp
zRGODS{7Y^JhVh+&&IZmZdAk$;cr*Z)#stYfdyOc#TIuidmCk);HN{Y@wr&wYl{h4g
zAC+)RX0>NUNv0N2yd;_vrv=R#J9%`b#_UtGZnucq4vm`{GLvDIv6U%C{Hn#E>hnUg
zC?Z-R2P$a&<;+AHkc(qbm$~4R|F_V}d-wIYKs<C6QGg@FACG<M*lF9I8CKqG{HKI{
zwSn<GE21z@OdZtdKWinQ7Y%VN-h5)K;Lkg~e8Rb$xrUlUca$NL&kKKExM)B9Ye5PZ
zRl|-6#uIw^@E8uRvYIlO6yZPnMiq;)>hW)I7@@Z0Hu^@mPIQ;NXRMt*UsL4Sk+D$`
zqTMIQrGajgoe&&#i0-Maj^X6lC~H0rG4`Iy3p9<?L(`+*qUCuS6OH{mdTorUU{nt8
zu!BD^_6OAFa!a^RIae%L&}Pp)OtH}>iYLePUZl1Bc*zNo9nr@0_*hKaL*ryzv;*`Y
zY+$0pK!8r!Yij9L%IPTX|GN?)AV|9f@Y#=?J=RlxzG0UQAuXbvC+!v{WT-P=)P^gz
z=3u+|KF+%z{*`vqoX$eW(a~auQu3!s*jLuM9?&x*;Ir@ZyOc!;dVtD*=@!*pgi?Wx
zM%%$@lf-`(PbH|Q&vG(J`AQxT2W=qPoxsG=yh5MWIknZ%hxebtGdGK0sc!-uC<KT=
z^2EQ}{PWw6q_-><e&MtLB{>d$*yF}_<+ZAuk41eQJ8K2z!BIdrSpx^&M}?mxHIdxU
z`H*dV3xW+7DGVI=r8@o&5t$ihLJG4+HfBXQK5Saw$R9k7j}#(!pVQN!r)hfTi7~lx
zY5di8RdqBD!ykqz+Fw2K{{pHR-0|D~b{0SuqK|hJ1q^-uK)#C+x}*uiXs}A^+`$pc
zC(LeteYM+pA*h=_Z20m~F3$4=oViM@Jr!@FjiAO+pC!`i%MXmS4eZlwkPH*E!DN5v
z0$P|Aqf|d(en6;Mzj>_ZOJcq00iY$DC>=TwoQGa+A?6SRJ0e6+HUq4E`9zm%((vP?
zeK|%=oG?e0i6rKfh7NAzl25H9bbYfW^5bsIE`1#3eNCM8tz7tQj<A+8q@@wCN3%AG
zbq6A8SySHHbc$EsJmg{Iecd^M)!cSgsyo-3YArkNK!#$eCf%t=3B+TP=aYL~UiU?X
zsoN<X+#;nW8ycEIwKp)XLqFuD1Gkl?IgfpxS_Q-x*p3W0UHN2sg_j`uCZO52xNq@m
zDf*j=YwV1+nfzAPHsX`#ce3b$X57QxC=cXB1&v8J2U(2-i+g=hRm?2pZ6ce$PH?gA
zpFMAU8gR+#7b7j5N63A!Mk8E-O0I*%1;0@jfoGPo2Rl1^teJYJXLAfy6Q=8RQvfYE
z8cIEue};Qq*h}T(x`=;`Bfau%@w$#i=2v~2EeoaX+{96@8ot$>b<~W`R$%M;%oxsu
z___AFf2y{L8uVZm)+U50)2Z+D1pmI#_4kgp7aK8SW_<M9m;f!WfiKvzVCK0Q7rAK9
z&DLitP|n_YV2ozpsc#pq1EmOcu5W%C+g}+(A;jZE(q2{LHY#lftV<noWgk<T-!6Su
z#sXv_;XxR)$X)u3eg?h4%*TOFrB$!U{IZjKUeY)T>j;-?u4^58@ZQYw0gzPAyq#jP
zd@Y<BgLh{`r>|IsRb!+(0Lfy=tzqdSvShn_@RaS@q@Q5D<|{eV5vPDFnU+(V?wIWq
zMmbbTQN7aFJ>98gVwrebuLVbdTwcqJXBLK)5M$%hY$fqZ0V35E_EE3RCg932id8B=
ztVc&bV=~@XneZ^w&n-c2Nmk{9ZHzX_NQ<GPRg0WqgP-+>jNa=HmbtUa!B2lBIt(F0
zVR04KPfo^5jLVOraX#;HGFc}bQVV{71jlokNtA6G?i3jWT?pRUfcJq6I}y1Nvq?l*
z#wn?Q;#Z~C_f}~mC*INKEMLBr@=NQrY|kv}foEjcMP+^u3;XLxLFp;iH{75?u)7IZ
z&xnH4otqUZCDg|(q?gy(-j_>^aNg&}*2^$wL)z;;dr2Zb*V~u=gYX>*-Tm@%b&8X`
zie>qge8*_$XS3XeiOdxQ@65`Q>PhVj!Hxc7-^opS97ZUrZO4TaJl@Vl3Z|brn{%V?
z9O`5Gt7f|5Xa;%yF{@IKCJ&NEV?*BXL(piam&I3rLsrD=psYVyt%<|<#IfwMAe&)&
zN<4EHB$4p|3B9xNCTBdOok_5uEVrDN>Nth=wrN$gw7q9o;qM+&*I$+j53nqUI<Esf
zE_W|d5Ag4Rv2n0NIZ*TDN3$Qa6yyx0@hwMRC-X*i?+r<|>zU@ySa0itgeHlPo9*Y5
z<@RYuc$GCqN^FbcNp}B6BXjIhl~ufiTc))AqG62cjZ4;<@3C;*El#bD*-Aw>CU&ZV
z+gl0@;y?|p)^h;*kS+h;cktMV6nxHQkc{8pkwyE=%4=KfP5IOJOKXjjzk}`&IAgx0
z_;focja?r-%re^ng2dBy)!#4`Xu0xk7<v93pzCM-bK3srSsC<o610d^9)^OMaRHlq
zNi3gg5=?I{3anZ|OgDXD)z3#RbiB8-RxiV_v*|_?m?7@aY^VU?E$y~9-d^zN8f4)r
zeLvM#h~W5luZ1NUm!BYPDSzaWiL}Hyyj0ke1*gph2Zc`fe>w0)1+k5NT+>SY2ptkM
zpI8DfI^1XJh9#^~2WG!{h9q0%>D52Oe<W(x=jR&j)J$YDfTMiJKYNgGRl?Hy^Q^?p
z-H`s{*}{V1&)%(PfnH|z9IT`>f>;QjzCTS>=O2^HTDB@xb}VS1%ty$Nx;m6JM&+6E
zdyFTt2Rp`FbaD8bso!%kvHp&?Ch5F$H-nr;ooZWc+}mYbRzEJNyL0nim6kRL4p6By
zb4zG>1{(<JHYQ5i6m_wIv}98LTni9T;Mv&7&`MmSfmpYBebQ|=XD|4%X=m}xKo<K~
zo|&3TUNTV9Gm~ce0#0UEqHVm>aQLm^$2SVJI1|I-sl5~p>tlI;*JaiCwQ!Mv=>kWo
z$h2`}L(XNU?SP}|9$kNQ5ZP46>WS;In0~-rLn^8sZuEl`)N$|>H-%!@Xe#%2iN^ug
zy!7@8TW5Ksz@MV$%zj^W+^XfTvi;S3Q0I-0vz1RMTFMHw>yp&_QF;|N{Gtn!o#01u
z9HJVmble<hd-I;;%GaR?_S7o&_8}Ev^!(EGBa`w!!8dSXMpgE)sr;IF<*Rb0y+SV8
zJ2CReqg}1j?Hccy_H}un$wq9kATGprW^pB%qqMWhaO6fSjO7u#Z%E4GeTRQvT@!|1
zcq<3FEil6#{|u9K2>V^0lnnF8^oh4xAg%iS%q+l@QdR&oLhb>T^b`?m5-Dt3BK=TW
z2<30BHBf~;@ci7sSx=Oz)6Z&VYWq@a5Q9!}M|(<<febZP@4jX<D<1CoCAVWD6<S21
z?*Wh>fn!E=)9rtTY_rzNPl_|D<mu&t?hM9rRld+jcD=_^YbH80R%ev2UwQD+NXR7X
zYrb&n0P@3QU^^S^0Wwc7p9jRiRO_1Ce`WT0HDrWEo~d7Ok9)hxKD;b!I<@!C%MrEc
z#}R{pq<JM1JX>ERPoR3Fjwa+~U$$y<qvB2WU(du|E0NDb$&b)2Zwl@n3y{Yq=CLh?
zNPEw(<P2O2?8&t9yX%j#2^l3EkXG4MjD5ohdCKX2<gN-G2&-qM0(JKKK1yjJFZXeo
z%efc;q>s4|tOR2|-58C(b>*S8XcWz2CEJR&Z^2%Zm!|F*FHE*~Qj5El#N)MzFgYwd
ztXUcnB2~0p3%b(dK=pqtreUW_RgnY9CX;=!Z*Z@&eCSl2S^_`S?&P$+7cw=1pV80{
zlMJ_7N3Ql$6n9OrTkAV=`dj2MU2iS0x3FXz`gg_Az8QVY=6U=r>q7(Bbg=F<4YE?I
zCPaX78Mc3yk>3|zv)wh35bZud6FF6)2>;?TDi#HZ@Gkt2Hy$3dE4aq=Xy~hnw?Tey
z&vgZ9{n(FbWzQO#v2EK8M|XSsmc_=d2X4FWHY>^c{(AemTD3Eu7^heU1KrVK?u##E
zj!kWfi!L5L47x8TN_Z3YriXG)^Lg_Krw$yV&*0}e_JYBQw!RbYvrb4N+bW27eYewo
zJ`*f<p(^ssBZ-8=D2<gYNVvi?nfq9XLXfFzfUnm#^Y1nDFA@sPwXUbi<=teS9vrLM
zKH+38gI$_T7O@%C-)Slg7PdCO6|$4A#m2Lq=Y!%(hBHV?`9%~en44K&ske>D*Hu>+
zU24gGGZcJJ`y`1GA`fh4VLxizUi{V6k1+ZAZbWEd6)Lx`)^VXvB6OG$?|D|XCBEa~
zGBW`^uF{FgE86>Qk*$+INzTujbzK2pHv%N2Kk1t5Pdy+nn_8U}aMLx^;gwg2M|mh#
zsCM2$K4|7(Am5n&%UOLiugH3!85XJWtIw&ipsoldaYHKemDBFMx7Px3T`PIvku_I|
z@m$z<3iwd>ByOFSqD~)15lC;T!53|*$sKkhBZ&FjFh?|LonQ@g!W@g;EI%H&H+TEI
zS;_)!+y^p09MhieVsQ8bKmWROY56&*QYV}jH|erH&-&iO>6$94>1`MDS8(I%;G$g+
z{c4lK@{3yA{Ce-~HI)tFc%wuS9;Z^FT#Mc7Wp|V>Jc`LDTW)7F4&2kOY!+QC4m3hc
z^J^!ic2(V})+N5UDB{zULGiI8s%dKd#uHhxr}k>0zx_tY1orU~VlxJK41r7}__@;p
zWgoG63=j^yV-x7^66m4g=I=rL0P?bOH>G6dq~w(?<&;zu6jW}?Ny#dz$jVkfx$66W
q20p&-k32*F&)}A&yn>1XR7Fwoe+`&Tg1w0bfWaMO-G<w+=>Gxuz9&`y

literal 0
HcmV?d00001

diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDPResizeTo64.png b/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskEDPResizeTo64.png
new file mode 100644
index 0000000000000000000000000000000000000000..b61d92a0e186cde1a77542ef7a5e06c130e9701c
GIT binary patch
literal 5373
zcmZ`-c{CJm)V7r+Oj#0QNK(nZZ!={{mP};Ho(d^3H5l8N7E7{IB#b385tEdCj8Kx0
zE&Gfi#u9_Uj9EXw@4xSl@4V-}&w1Z-?s?C-=Xvja?oGbtU@amfBgDnUC1P{M(&+#%
z{$~O_2fOl-rQ`$P3%qpo5*K%3iPmzi@4@--Ejw#VF3x|e1Y4PNa3cu4;to5&EB|Yk
zd+&p|xFl3<EH63V#Xop6nd9{>{~MC(8m{bf5v#>Xx*FP<@oXra)PK8FG63JBf0v&r
zWj}HEzS>cW@r~JgHin%6vZlu_i_ril-qz$*t=3jZJz1#=36B`)>SfGM3#kf>X4(4V
z1&SyLQlZ8}mlG4~0_9Q~6?`*zAGEQ-5v+;albz0lmlyrvK;6RompB_jPj;~I>pZW#
z5t<7#Cx!n#<Qv|CoqO^#of~cB_UuW~6w;Bj)73UWa8b<kkshaWeJ-r-Y``Xv-~ane
zo_xoW+9_fjE?Kp0KLpP;B^-U1YaV_aJXt}RU<}R;d<x9h925HD&P-FURwHxx;K$8W
z<C7!MW*vmpVfP*O^%?`gaVZPdF?K+b!!(2QWLg!~6W<$v2946osKBpoeIC97V@=3j
z24pUzkD}&RC07Nbt$7=(k--TV*Ms-Y$dtzUG73;yhJvh)YN$Y@JpO5xqoD|IhoYg-
zPFK(g*@PSqj)5W1&R6LPkdR6V!0-c%PLxQ+-X)Qfc#S#2sliD)nrI4u=NXhI@%vor
zUQZ1FoNlUcS5Tx6wf!AS#qR`*vc8@mO_G6h1tqcfkC#9+n|4q){{X^CtG_YG4Jt;+
zf%oIw`H|nPt>y~N%=VhWW?M-*()A!>u^NikCa3w24D@$3AUtrlKIX$hM5BF=UqI%G
zL0jo1i(W_QJ4K}7t6Z@>$^u8tJK_>_6F$W`e3@4aiRR%JtHFHp7Tn1^ee|bbCaq+Q
zOVCEz{0i6bE}sI|+T23M(NM_Gx%muZqC1x@w`uU|nG&yieC$NdN4v%qmNk@G7nrO$
zq<iuRp)oIheE(X^bKFox{a;ofG-hbzjQ!_M=E=B6uLjzvz%E}YGlJL|k5c`#;?>{o
zM={-vy_eAr3zaQlZ}3P__%!MaaVqvkWIXPRZRI+v5(;;3^8FYih2>aMYmc6Z$8j4T
zdTX)yZWDe&RF-|&daj8`%Y<^?L8I)~tP}d4j~`nYblkdg)S>J5aa1>j5MSyyjl9-G
zPRP-0nBBt(VV0<(U(c<Bg=@KaV^j|~m~!Z3tISSys*jllbGPdQ6!S=*J$=AiLcONv
zLy>9ZUSZ@gKV@L7gepmNiJdm!dO%t#g%>Mj`n;f4wdt2YNX0P(D6nr!-C6-bjkPV=
z{;BZUJD<zu7FUYq3}5J4(0R~vm4c)C;*XuW5z;?9>-guSQS-^qj~5Q4XhNQ(WfntZ
z9zRYhNekgsc-h^><;-*OoR)Fz^=%R2wn?X@jx(7E{&&Z)Rf*|+9oZBT+tm7RKUZ_|
zPYZ2RYsP(xSp>bU^$Xtv317K<veB$hvrgjP2+d|UDPzn)zPi|cL8UcJ>RV0&$+L4e
z;B^W!C<yu2!elgX>$<nAC(5yUg#SYvPEhIgW-gJYSH1SOb4>wiN!v*9?Oe{KBW^^p
zB!$-sFA*7!p`6?SqxUUwmJ9~buuW6A@-yf0*1BP8NFxRCqA?4Ajw7!myn$W4&!{4V
zWUL}2=`Uh`tituGsVg+e9NBP^j3({6wF35(^rui+Q)tNd0Wi`?c`jnlyvt=N4NN)>
zj1Vy&YnEP)aVO}6XqPW}$sIlg=v#B$!SQqRg}(AI|Ilz}7i2nAoFvx7bLj@s>FiYS
zQ>ja0yZLF_Oifo>`A7ASDr=kNr*DGm+K+iUTH!F$`z^mK1usH^6rn)mbwjq&jL+UL
z@mjskP7X0NY`LKVGKD&9-_Y5%x^r&?I&*sFH@#oI`oV6FB(X_uDPYsU<U>Qn+gxqd
z^_Y0hH3m3G-9ynUG?A{$c1;>nCWFWK?c$SN(Hs#@l6sNku5t~e6IbnkJEW2R^&ye1
zHxU}Wkl;6>=13CmkUk;%v@v0!>kq^MjW4ciiR=Dj-QbA@2S+cxTq*m})fGXW&}{p%
z$ZSGN;NyJ|ffa{%W6tvpS9cQ9<8DA-F*ZN^-D-Zm074s6xuRV%FkafyUhoi6`e#n3
z)mEj{Qqg^L0;)3HVXRqS_%hqeLBKg;=W8l$)7CMLJcM;wcEPns`0weXTOMum>AT<z
z#t%g#X9Sk0NUVK^e0?vk!Yu%@dHlEc4By$s7JuHyDnWlBW$Y~@g1#Sb+u2+?AJ6+8
zr#HvASI+8Zf0$nGI~)2Obguj*vkcm7{yI2u%7O`O`Jf*kGsd_-Y}IJ|K_ANz-cXN{
z?;=Z?v$)w7TPpM?{++uO*9ykiiPYlGJUf(C=dLTdha$Bo9!BoIh3=ojz;)VZ-JObq
z;#V!0HKKAw;*2g!RcCT<(uJt+b0Wl*Od?G;RVZ*e9;f8FGbsBmZ?8HI$KCZ7YR>=!
zall_qs;JEy8O;A^YD;QUXN~Z46xoH^z(<CpvFH1lGb)9TODFh!Ruc}9JTw)&d?YsD
z!8#_DbOdqs<>P5JgtiD`3TYG0svP{9T0L3{F>H?<3MPFHCT*0lqnd#pl8sy}51eL5
z3wrgShFwOU@>!RE{HA(zJEl*O_e235viqgn*xR;L!dx^s`p4fZ08?INy+lh;X{GS{
zOgX@f>w1LvyJ}^~3C&A?#r1X0SwilyO8LBvBtJEXnPz52ZnnVdM}^RFaMyTT!|Hw2
zQTcp!Iy*A?;ERvSzX9#@vCCROV=vqDd|uGliJHwPssLNS+PbOp^rZLSG{Mzu1zb#}
zvbO|z{kOaR?3YvQZuZAzR`IUcgU;~k*1&j1n*&g*E&YmyNY%z=WUP4$GjD#7eTyaE
z%jh_^4%GGew(E9Ps>l_aK@Y@5N+QfRRMR`wj7bkBK(|1D70~TJ;XYqlrq8c#+b8Y3
z!MsWb0^P8L!vOCJ?q{ojg*vcs=TlfX1(Qd);omU7bn_yeWE;n}AO)1Tu}-%(YOJXY
z0lHZxjbD}A-U%nKBHn?WHnX=Yc-Uga7sdQ!xlrBf{xoizUQq!@PH?Aj`Rfufq&d%`
zW$g9q&k=`ds8y_km~2i7?CF+bwEtr<Wt^}tIu4?*4LtcU3O-4ZBR+FQBhzqW6^sK}
zkia<ZS0%kf7Q|d#DO?6lm9q8N7At5@g2pK1%<)&0*O;caUdtXzqriahS!5xVe}Asv
zzI*Y$)7rR4DVqy<J8k0=>5yT75oDC5vUop3Fb{Wj+b;~g49|Fz{(>E@Kvpphy!&sZ
z1x2~DW;8#xe4-FP-^WjUO}FS~9a{tR_{BdTC*5>4vhQa3Z(TBO4~bs+wDYSy?yjK$
z0AW(@iO+#78^~c`1HO_F2|ImzsiIed=~}nt=;bi{IF>U57_;^NzK4rBr#Kg3#y?W)
zl;ksOg1Q`;s}uj-cg+z~nOP-ojeS|YZ%-1~tXjhtsJ~F~7FW$a8>h`jMTd5;J=+R=
zAg*wZThS)=xaHQfwD@I+sZH5m0f7ovg+B}aV+_d^>6KzxZ{t2ZlE+ug9R%|QT|&(D
zg_`!oCtpU-_7FV!=pAxn<m3P}I1?3gZ<ZP261~zI>0mN)d$+l4gMP1<0`I(|`pax|
ze_OCIoXx)p(<T^cZ|nLn%3+DZ1Z&hlN_svjIhu8qHiTX&eD)5L{`H84#9(q0ShP#&
zPoU^oj-!JNS?(7Vtihrv8LAh5^hI2xmxt+%6TZB-Z_Y_KM=eB%oY<gB?4(Xl0*(xr
z#gob*c<jCCfPrTBu8044TgzLw+w|x!5brRze8S>pn9(jq!y9GoQ)U8sdBeP4`4xh9
zUW@N$TAuY_=Vb0aS&n@*7(d!|hA4g3j9UO4d&~L!A=^8L+EgI!{*t%;y^&b4;_<7$
zwmLb=c8bHy(B36k#?3W^efx5TLQP=2i>-u4)gL>MhwmMh;N%M)A(S7hJESTq;j3j|
z?B;Rrum@VsK*{h?xgriDLXk`@?#f#QU$41_xWlID(9B_p$qCmAa^pOAQah$y$p9H(
zqP1`jMlzhadG@56PcvcXSHb2&Y{*9fbLZSdZ5uk1BYajP`j|s-Z`LWywoymcE#E0n
z0Et9u<9&ODN}fFoC|`g}gzw|IU%RQ78zcN?9u4m=1BF`9pCHizQ}m88)jvR)+K%e#
z=7FFlTXFigq?A=z47ymilv6omu!i@!`vN>qB>x;Xnw2=;iG@ld%rm3<ckNqHeg!AS
ztmP!PWJ;{ntc0_Q2OgogE?K7Nn_l(P$yVs~1L{8=c<O949Dq`)ar^-raZ=snLe|^4
z?Jsx}cb^5I3E~4&OK`XNz95Pq(urMf--H+a5;Jd6n3Al*nf=KODNOm+JWcjIkk+)2
z0#16%**#zl!4+>jB=jYQChOfILIx*dj#u;u_o%$0FsGnVKQ;~M=5_KU*yABD4Y}V&
zj?@fkq^JaJa41$D{yTR4Pt>F%WfGT|C6yAjtT^ImF}+^sd(mI?@GW*Ees${u6bLJQ
zO$Bt6!BUp22N#V7gDbS}!=4?<d*D@Fs7qfk)R?}#=H4Eq-y3<#TLLzmvha<Z{{+8W
ze;XMo+8(i39?V4<Rs`L-gv>NOuZo)4{1~v_;e53_WL{@90G2pm6&vtu@`k|-|BVEZ
zb8HnG9!0@n$H9wY<TT_}wh76`#@AXJRq<D<JW$>)kl;2@|H12%vTt?NX1{vTAu?Fo
zJI;*GztHQ}hZ6NGJ5~i4<u4BmY3$BPxRwD+RG!ALHNh#k_RE!vEbI9}_<t=9r~?wd
zT&01S+%wGuC$HH}#wrZdJn~9Uli2;G@n|-CuvzrY23Qdn^9#C<`JO}6DrjA7{w=s@
z*V7LKernfxLo&B+q-|#*=YS4fC-T%ngU<E5B%5kr+pU{PN)zwsMPmz~k*&_H3Xb-y
zFFZv4FjXSV!I5f>4f`7ASO`f8byN2QdBSTCBi`tEncYXcS4K#(9_lYSc@sWI`wkl_
zyeJboZ61-jWhZ7OyMJ?BR&f{|nB9g?($x`kFYcW$){aG{0DmNP$<}E|GVFP19Ws1^
zZ+D%PG}Ca3v)d|7l!<!V*TG56xO@o98RVCs*?e@>cbZ8P-%T_0jk(+0AGj&Ccx8O5
z6yzOC-u!keH|CH0UHO@9c19HNZo+;Lv!-^Jf29~ORT>8&^}xk{zO4<-N3!_vT5@8n
z4QV3#bH7OX6nlN2x#m(3$P)6299p{1pQ=*%io~=v8a_r5^rWD9zb7@mGFb#MNZ-5a
z);H7pXUic%;j`<x?V*2-f>2T`2+I#^dFrN%;<tc?$e~qg>JFAh2|<Mp7*hr4K4R$_
z9(ZHVzk^o0#d2>TgnGDm-oWu#^Zf8lv5AUo;)S|-k#a{2>1ak4wlN~}d-D?kNd%0i
z;;~AfS)Y-&ehI&=$KZVF)f+iudr{AhP1tQB7#&V-$jqOB&vSjW+Dp6a#7$P+4r}2g
zW+@}IF@>BG5`M!4*(CdLQq*OCY2Zcd$Jlje6KAr!D8ndr>jqVSv3`8HiZkQwM955U
z8S{5BZBvueJ7^vNI>r4F6yAf(Z3B7G8|9pllo;F|bB(df4h$aMq_^yk>uf%a6oa8&
zhkieK4$S#I68+u)?2EV^Ik5-SHH+X$)B{3W`Td5P5ASg7$#S)?F*T}r66&1mv=}S4
z9@6*pNw<hSixC%@-xw+whrQQ=Tk@KY3@5(mA6n`LVl`T}QP}a<<0lo`m3|f%n`QeP
zWcC?ufjFg+Nv*Aa$<byVEvs7rPgHCSN}it)LVs|GnG>{;Imcz@$a!vB{qY!ozcJ3b
zrTS(6P<FTt>ZpRU;Ge)#%67`oC)q_n+x`yTeqYZ$2fuu%(m-X)2la{An+Z^h7N$ji
zJ!pXBYgekTg!pumMbz@Es)2*%$ACnPKA`qMoMQZ=kcAAN`-utG4ICXx|G11_++|jk
zzp2-0`+PQt+X9>@-bt;tk^}4BV(Vc>0$=9H8z8ogbEkz103ffwgUid{4ysn)+fx{!
z=pZyLL}So1smDG8dpKlZJ0CU-=4=P&CS69F)<)y%0ZWC|o*>x1cUJ}8y_@t}YS06m
zrT&5vU%y}l`ei{FMEzmuk&e$r?dGK9JrFem7JL81SmXE1Yn+y(yWm#Zv2XcNX=)c?
zzNKoSP0`veIm!hU+1(j<Q?5mM*4XAt<d3kn)#zRsn<|R_>9UriJr6;oS4uDIgS?qD
z*MwGi?KZ7Iq!;^xx7T<H=vwe^hW`F6?!pv3wO<_cX}{x+jx&WW2OIR?@}sZJ1P$RU
ze8_iTEv?&M|0{LZ3dNCif8B@KM$A!}YE{%p45onHOZoYn{CvDochEQ_+-!bZg!XWt
z*y7y+WU}qUCdfHfR<<tYwm11%<DANEeQT3@EMl4(Sug!KV+@*mWTnUp)e}hZz+Ok^
z4>U)9+p)h6+Fk5qHhl^$Aet<pq%I8RM}&Nje6;ALl~iLvup4%B`&Z=D)oc{$03u?U
z-ril!hhi|%r5B`k?ufU@gd_ugBD3Q!sLcLKfG|0^OkJ0m1wCPhZFuC?Y}O-{9|E7V
zaDyon?frOM=xe4E@;QkXnYQF#u*84%%Od~FY3AgkQbQ_s%>PReJ3Lvl!ssIEHa^P-
zrw?Ie;V*qYh}@TkmE2tHvrNeA;xWdlm1+}J>2?xuu2MIUHiJ)Mw@a^XvG%h2$y4zF
z;k?S7IARla#FVDHal}|<Tn$&ZACgYak|G+sBOjs6)s+o`ve&8|`-&RcM&u`lA?Xv?
zL$l$A1po<xb=&BX`r7Ql`(waQ`wzvV8{0Ids*Nu8Lyb^tWzs9`ycF2uqUoCshIp55
zR>%SbhtEg7f<!Jh|6}5dhVAl#@m_EaV9^MO9YdFf9?TDw!>n$E-SG?aH@^MI{{XnO
zHMMlrG_};UjhwX%jCJ*mwKdf=^^7$&58pXu@P8a0gx(1Xi2DB>;6`s!4ji~_tQ;(B
I&2K*aAGc-ylK=n!

literal 0
HcmV?d00001

diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskResizeTo64.png b/shaders/CRT-Royale.shader/textures/TileableLinearShadowMaskResizeTo64.png
new file mode 100644
index 0000000000000000000000000000000000000000..9b66ffba3df9ff28c75310937f11a129b7a85e74
GIT binary patch
literal 6008
zcmZ`-XEYpK(AImGMWU@1Jz9iC^j;&;OSDx(R_`opMU80DJ0VI~y_a1P1gj-Vbcq%c
zix9Eae%|lj_v1VF%ze(CbLP*Sxiil*Nl%P*XsPZ|5fBj2>gj4ey+zgkjDqO)7NFiu
zx&;z<H6t|w!dGQd`#BD`Z!%{C9ZdrKf2XX!ChOKi>8ooUaErSCYue*4lnDq}s`WJ0
z%skdz3zkFfEEbQdm<Jhr6+f?7fU%s$^w@^pzzv^Bpdgg*St;Fb^86TVFxS+6sTLTb
zB{uT!{PY1#D`KdUFzQVxQu{+xlPgzr=S#VK!Qz1K)w+P5-yt|8^M3ueBjhy^hojL2
ziaC_3A5h3LIbC>88_I(X+1wTLcCrlLQVb=E%6bIZ{KVny@-_fNQ=;DJ)hdeZY!(46
z#T=JeqOFS(?2pEkFS)~hS4}BTuEB{xc;QI$<+>@tNi>q~0<5AG>s+7^C^II|Sw9I1
zFVlmR=VhEn>wm75)Mffvl``^B*1FWJ&QL|qBb`g4s+&sBKl^QgBXC`%<Q<tFS=Wy`
zO0xY}vi$4NWU@b!s>?z8f98p0rT;k>yy`O<H+MxJ1W>sMd(8|3v-HgmvT(IZ^(-0s
z#`GE!XBSRw`GLGv_MM4lEy^^#B_RZSud9*~3%{*2ML;q{iY1-Ym5Z4osGhUtBNh6D
zl1@sEc|AFl4fZ4N)A3J2-eK!>sW-~CybWJh{Lzv}q3d)s;~>$N=iITy4<OX9UYxGF
zo|Wj7gplyf>j9i3KtpQ#8!B!S)UZ9w(%(25rM9prrmMfk9zo>DfJ+4o?s+Ihld%TA
z)y2kM4MDhZIZz5Cf6x?VZ9;;4c4@GQ#bJiFzIyI&Jg(RwMZUqbA7eqGY$reY{~UA8
z`^RmPUhf#9V@uOC@HfIAN`Rpc_WmM=AGA8GpB>6<91C2@OvWkFy^vgs5fJK<lsF`{
zO2}6b(@;YO<z<}<cq};H{luA<P`JtBM0$_5i<T82h9Y{Of}lIU-=I1hua!qGOE=*L
zHAQlW!9Gubj*$gQAuMh?-P~&Zbcepp)GWxAgtPy$HPG*@9QSy71vVT0RLiP#EIi;&
z$72Y)HaH5$m!d1sM2qa$`-R-kzFsxh-CMut6KBiMn07N#E;N<D=!;@2$q4}WgNPzp
z)=#<>#t%%F{`U1>KK@{P3hI}mBIi08@yfh6UDHjp_Q%w;?jvyRiJ&dVv^4D+%@Ye_
z;ena_6ZKd31LqC2y2Ja`u8P$Qgu7>$_3bkQs&!dE(K?#F6`9a#$`Gb<<Ugo6k_T8r
z((g%t-{1O#Iwygi9_eV-xraPj+yWyNv19H;8Gu3`t7{2)WXU8lAe5{-c`8<Qi|b;C
zqc6sN#<qD0-mSGX+PFIjVt%ADjN;&Mc0i*#G5x9-!d0sn&Lb~Kq;aEjE9v6i4oB?R
zG0?Kb6Nw@^axxFV+$m9a7;&a%$);C!j3sB(jG<Sf5M;1nP@`b*qjKjVVOTB$5={^e
z(E|xYiGc92B#df214BGrAmM41E+iwKf(YZ|O8^o~9?I+&L6I46>E1Lc$+d|WQLyi|
zIC^ERpPbNGo(nqFZ=_d}SC83^)sVJE2(Ng-qI=5Ks`E@_#8q6EO_^!-O<a$}#hN7F
zPYb8k{9wLU)=*Rz*%KsvHLou{B(Rh1+OJ)mH$4OAx)KJ}@gG0qvlVjdCepilFVv^O
z=0lZpXrM%er95V3%RGMv2>2Jwyk2;jSuWvVztHL8-C>a<&imGbBH^g;e8)Ux%xGc9
zpBotlg8HoSK27>2=}pBt{th2HV)xr;dPUaJHv+)>wt*hmxJN=68u=4zjoclcf%+Ow
zoW-$XERmZMKF_)1Z*Y7zYta~@0Y>9c$CE3`+v0C7P7iYELZwMXOaP4K&u|Zw;H7jL
z_!K^yB;Znv&cLzzmZ)vlcHRd~hQ634lPDE(2h>F6%wWi*h2^aWVjKk}7=S^!Ufr<P
zv@sDbZnD@tk))?aQ65UJ!3Ed?9O-sd+nezkam!GG8ll5^1Q9Do9v~Gj*F`*azhSt`
z@gMD0vUy&9EMQFZa>x<SU9L>~0tGm=>mQtjo3{=CIa;WUH1B<R<>U}Oa9}J3q4)|x
z&U}R@BXV+6vuCjE#a7R`U*#;eQ}Zn{iQM*=vD@riF?ffs(-q~OLB>Jtrt%DQvqgxP
zqeZQUVTHSAwl;Dc?T2A2lKj*50xUjcg}qxM2kOdv6+SV^4T76>c4Y;24axrZ4*+5&
zfU|>3SdLkw-yt~J<!&QYj){<Y&bern#MliNnUHV||MZ6}5te7CtIBo8lcxW5&PUPH
z$xi`<nW9x^ybcn!978B0`8!8=!bd>M_9a068q*{T4cf@&X!yFw8qe}vb#QFAn^b<N
z@#6N6&~@t-5@$>$z;-sM1<kyJU8A_`3*w=PlO=7-fV)tBZ0Rt)|BcjYQG(BrKFfes
z`B1VLEb)ozatly9f*QcMd;<IACfXvOf)V!}>L{Ps>=j5TQU?-1To@c;GsX22hnN-+
zKor#n3KL_JhvIN6Q?{+){5~~cLshBUu$2X3u(a-<r&kqq#c_oED1@Vv>G0f}XG4^&
zv~v4>vafF3pUbWb%D8L}S+!E-(d$3N*>CG{+OH?Ac$k|ki!K6WMx3>;HB?-8vJV7F
zEywJHyz=KwmOY_A^I;dnQ#=9I?3^f3smCNLDTo(T7aOs+4K077=KSiHVhfa{L#fG?
zWcl;eAVoXllz3;V>gwUgOGP3w%a1C3v0H9}2@laoiIUD2XosLXMe#Rl&urGTH{40F
zZ#3}h!XK&}|J8E;L20B|j+WyO7;Wv<FouDHj@R`XTgjRX*Ft+gXgXI}#WrFf=*U5O
zF62b(41PP&NP?ndJe*FMmHYG`<A+9IC>>Ps9H$)WeKLJp;!qB3@@PTIX4I2=g@uUw
zd20(h-0J;2Yz>ac{mYeY?wk>oRrsOfn_=-T1>!^4a>aL9H@TnwX~fj99a5`Tp2(XN
zs+U7SC-kDHtAG=B7!Iv<LN9B53_IK=*FK~KI5i~|&zB;3ikbwADPCAF6gsogPrCc+
zK%<8df=hOXKj5E)juN5ixYDt`FKkIFgPmx-$*~(FwgMKkCzhnSeHgNAvqfd5m3nrp
zYcEutpPAx6Zu-`$$$d1%vs(5-u{on@KA_oAve&s@Z!7tJZOXq64`!*FAgI_ke>09w
zG1ide!16M!sx3H|N2V}K%c~SSbDxwxXYH6D9IhiA9NUI&TgR{y*Fw%NDSGmyT&$zU
zf`skSXhxDfqx}G3lzEEJ>|bmJ7EiaK4{jls6Sy~fvlU;2Cu3nPi#PYtkp(0LvurUO
z+MfVa)E&m%bQb=;hAzddx)$<vHG0eQDFD1X2*{NXd>Ec=6&5QiE5={~BK(rgm8}&c
z3j>oD?4l8obmMZDmw=blo`IBsGC}hjho+`FEsuq+Ch(0PFNYMxH}#@q2xAMSpCHWz
z>*!NhJ8?FrdBq!k!weBSU72?(Jd3qdWQ${Ws18E$@Z{z}<DMrErNQdmJ4=FKDilDE
zObArx6{%J30oEEZ&HXWt6F;~P)s7qkGV8yE-wwXlum655{oTab=fl?jO?cByGZ%tH
z<n8ac26zVEpErC@ak%h{j&QGEYB>X)9T?n3UwIs4H}J7-9QH@x`GBi>%;RZA$s4dy
zK!z_Yj60%z%6>4^c!t)2JUTsIg7GNVdyhPn<1!WvzNH?m8XZNzP$K9_lnSwfXDJX{
zngW$4rx$--0>8mAKN1-Jr+xh?E^#q>e?!qphLOZ%lYN8U9}_=p;tp}ui9Mzt(kRT;
zDaYqu!fC@^rtKS~Sn8BR@u?OHs*=4x>nWoxxU9z`&Cy<8jV|?;g5A_{uW!muq}5G&
zyL`*DK_`F^M>m_6`zP!H{x@^BtNv@?XNN7&uYJ>fAZWEy<cN{A1j?uD)%EHkdf3z*
z-X#+J(&qhuIBm{vw#KE!?2;#p3s*I6_<=o6w%)D*uM6(6e=>Ri<cACQ?&Gk8PxK89
z`5t}-@|p&31r(#Fqo%QUUlshv33*CiOFS_TdaLPeWUw1Rno08%V7wozuq<VLT<s8v
z0_u5W&Xq&xL3L@0!qzk9b1tMV7}3VMWGP6xaOyi!?DWW8Gx{&O4nq3@XsIoE&S>C<
z%f}*rd~rat)g77#D@Uj!a)d;*BXM+kSEPyaZw;|xMjzGR_Jw@okr#?VP=^sWW;a$x
zO9b9_3*Mqd?FLb$?wupwFxZuG+CG0go;?s(Go#`eKykKM9&gWr$85~h%T!p0l|Bzm
zeobG_z6JA9<F|U5KSqWimY0!UC4vc|gC%=7^iq0MwHO$*P$YDY)=T~a;>)H&JsZ;G
zmkh5?$<@OgtNwP5>&*#aNsJM^Te*JX%wz%s>_CBkQMOIF=8A*UX|G-1t0fcOw=>@=
zIx=CU7S7-yHnZ>KRTwb7Ky{^YlMDBHNu$f8d*t@r(lZ_V#**n&o@BB;ljkoz-eeiQ
z%nBq#e)}ZELFM-1D7QhIyvJX-nBwq_&*b{S!}Ct9bkD44f<5<4WB7RueviD(q)U82
zT^_vX3uEy*Hzj6bqHE}gcQk-K4=W84jQMZof6QSgnch4}x+Mh}BlXO)f7519tV4fb
z!W~))zzj-pP1DyvQfP2k5O1CjdsNdjhz^GpZ0iGF(hY!^9SED^W(Gy0H+6;qr7PEg
zOjjXqmVTy1#FVje$@eKUztkf`Qio07QOfkz2@LBedx1%#J!VgJc$A-tK9J368aAZO
zNj14t4a6t!-vG844(_%MKn&UGx}6qsje-Mb8$$G^XS`89Aa^=9fjFD^PqM>+vv2f<
z0%j!jFBocnMEtVRTU3N-MUwv#+pmPq9{Q(!I4rBaUj45==MK2(R{D`WT0i&;DiJF?
zK80uP^tGi4mGDkTIHkSD6b-T@LFVq^`TB|TBf4iBL0`Z{U%|<wK1<W4PtjoSC7BWR
z0)?vmnji84C&aRIBkJ0S$QT2CTs7x!mN`E^t6+0;#5Gl|vy=gGW>Mw09pkZR@zX#*
zt9H&|@qBeDKHLz{oE&*Nfx8ti8pl;5Y5O<gv$$V%W|qwnnHz)YIDr&7Ni6x&r)W9q
zX-5r_Z}wJ2ah$Wbd)RuG2!uTK%2KYti}087PENDFXzEZR>#d5s05A^p$G$@z4%&eU
zoZQqB^j1Agqi3*YEJ~_aRBY>dV?Zp%zS@E4bT&r7HIEpG8pe`+C!?J*^N8$_(Y6XP
z43T0e-2hTe`KKE4D9Qp+u@oThP;q)`wRR5>rwSu{(4B_HvR6^L)Gm2-v{u$vQE3hH
z<Yqsf?MOt4ul-3eku3H*V*Zc)a;V5DT1+Nxu$A#!XI;W0l|@VSRa|Y&X$0@M9Ioh$
ze_D~rbu%Y#<ZHf3JHJZibolHt4yz)*>6!xcRvhm<eK06679E>Wd&~P?Hgzg)!aHe)
z&o#*t9ws;{&YIqcH!<WULX^5!nb<mef5pKs2{-6lFn~S<Ivly6cd!8$bQc?kd7{<B
zzMe^Ud1Adv+^3Xjr?R|!@N<20ozhk~OV*WNk*Rd%=S3XtK~=<-XlNM6RcYzW>9wPV
zJda>M(sUmh{`t22*|+8VF|Hr$zt7;W^QK;M&n_S9F7^spMN?OMVFZPOk$pd|Sl<~8
zEad+aQ?*2b<!zcaBp(lP9uO`#P5vR)N7ovVnVVq-z&fm39pGmMnVfopK7&(~+oEpq
zy%uHYmZT#a$T{(;jM3{)7bYvH9TS?b-28o&Y}gNP0UGBKNomthe^#aiv>Dl3KPo~9
zluj8|`0!;-@hoLdw=e&*{yE^xjQ+~>65g(j1gi<`7$#;wZnZ!7l9QjUsv(;(;Go&>
z!y<P5XJD(sqRE`jc$(s-s9R`dg4+Gnu878ran_cWbrW;>8uEKpr(sd$<id(|UJrUG
zsmN-({BphmpTXTLu(p`eUnlvjm`X>ylOo?|)L+=Ld7)32->@(0je=s!axCZ|d}9j9
z9LgJodLTBogznvzi+*T5Q*=o<j(wMlSbnu^4XTkqXaC82v#zpC&%VTq<Fng81yX*N
zwugM7yyA>jT_ky=TE)k1i`Y0y3}gYv0j!##@*}4!@oIOR;`rVnI14OPIb!D5-}CW^
z@)B%F%f`}v_q?Y*@wQQZxHjYqo>-do3+o}IIr}l}WY8`B&SbjOY5AL$%KI)8xpyV1
z|9!BUZ>bkyGWaR)XXDcMec$A_NWSwxuIT1?be8bDGZzKLpEY&c0~N;in~ZROB##1v
zz|sm4jJRTtgP@6?fSQ1tJ=#=#?JA9!4TI`7YCc&?4;5LI@ic1^(Dr#tV$l!iHPr?G
zvpPX@5^f2#s?2ttC4dT$n7?yHtqD$l<=+HK<W673X+=C3+x}V$R(`oT7&BOM=?=R5
zoYcEwp+IrNI-q=k%RY!XE=bi4`Q$JjIor={(_@nOtzYeq%3sQB@kG#%9lhJNh^xVF
z{KGvZ|I0nSeW*XGqo}F(mrljRnXkd=jqLovjandHJnPUQP*EoTP1%BEj{`zwy{Q$L
zb=bSjTwREu2}6di(MM7|ldj1=zG6)+&A@4ape$b$r9&S2?544l6@T4tZiFx33MF|#
z{f$xgq*>dD`P2_YZHROsz9f4a5%u$c2;s+Z$p{OT-`OI+Y&>0;dHpk=@iq5fe+``T
ze52g<E}2VhCc{l*n%a}kfiG(?eYex^H&+TGII$2MlUZ+AQ`l`~IMN+$UMm8UIz4EX
z&z9Y8(s{70MNGr|S*qM$KV(jiEYEi>n9Mv8<Q3CGm?<nBFW>sJrlE^c2X=yIm~31+
z8amfn$}!cQ-RP0CO79!|c0ma{k;c2&=X6&d8~s(1u%6kmTO~Z~E?d@GdP`?@1-Bb<
zw(Z%oZwSF|QeJDIL9Yq}Fxt}ZYNWS?k3kTf;;nuGE3R1Pp}VFY2s<6E0oUqel8*1j
zawM!1enYQ$xY&6|&5(^^R*au7;i|jY9OKz-zbKk{=M9$1wS@3T_lz++n<QNy+cx+W
zpU=1u;tWM}piH!L|J1dkKdXH<#tIs<e$PN4yX0oRQ~Z*GwS2eGEcr5KH(8_D!dXme
z&!f2TJJTR7qV@xJ<?FK%zxrEdBQUVT!Q4oLGZHN7aLiPdn?)bvyS#VftH@{6nn*g6
z4j7k?PJ>e!?0j;eAJCbQogC|pv{p`llX&~Y$YWtV3ID^ru}D)(>Q|&@3(AiU3rm<o
z=ZL|Zu_GohT%mm|es$9SCO_?bGe5BN7y7bfGX2J*XI+Bx`Jvs0O#ah*T>F!wVEs2y
zC+z;{wc9>~p2NIP$#uUaoxWqwE(?DnG;Zbi&EDwkE<k!>`S+`}zWgaWsWTAgm2G%T
z-|S)9{&}so`>4_jH}#r=VwsxO`!BpY_FydF2m8FMjEz=rUtad)gY2^=y~e@%;dqgU
z)>5+Q#>MjQkZ~(ZjopBb$?4AMm0OKpKkWChrV>T3sG<T9tq#vw;W%ww{CWb4`WCz6
zEb&g+ra95>gtEAjQnXWyR}dsam5`Js)h(NxANt;Anp*GvGqFDfPBe?+?0yz(CgX<t
z1OS<`MTUMZmkzs+x9UOsQ*>FWL7MIBgvWF7U5CGF#8xx2|CL6H4D1VZuIbgJ{7X$l
z6V?TfKG8RI7n`G>M6p;1P;cB_8zfHY;T}VgDJ85=+&^1>Tl73UZeT(h+bwkU{0a`?
zsmBe`&J!!;E}P8aQn-g=|1f2e9yDLG%Q><4imjf~gbO?icL@0!g~`SfD}~j+<g{3@
zxLsWf1Zr6Yf}H|g6<qvXZ-GEsQtF|&q?EWc$V^I3;o&0%X=!muSp`YS46F>-{|WGc
bfIZ#9{{Mi@>XeGx00KQNW6ef&N7(-WWDbu_

literal 0
HcmV?d00001

diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png b/shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png
new file mode 100644
index 0000000000000000000000000000000000000000..eb20b2316fa22d8180cc685bf3886bb5975ab2c5
GIT binary patch
literal 204254
zcmV*AKySZ^P)<h;3K|Lk000e1NJLTq00IC200ICA0ssI2dm2Nn00004XF*Lt00D-e
zG3b_G00006VoOIv0RI600RN!9r;`8x010qNS#tmY3ljhU3ljkVnw%H_0Du5VL_t(|
z+T8u;vn|_o<p+*5+a7!EyU#m#0w4&GAONCBmg!P=w4x++>s$YQ{YmO*sTEy{=#H+6
zQb@&O6-^P|KJVYV_uON*OVc0L`OO@ed!PFNkSrCk;{|zf&dHsbbImo)m}AbB>9^uD
zmcA(c-=zOidRD5WOR16?X)NVZF16B9Dy2sHiS#d}|5<uNsz%ahq<<m(v2<6ON^@yn
zD)_IK8mX4nQZ2nE{SVUrTY5{XO6djZzmoo2>2p#hok^$CMB0}oQfp69NpDO4gY@4?
zzmzurdyDPw>{CiFOaDUpKS}qI(^x8`nKY9oQZB8eb7>)cB>hY2f0Euz@_$wO7t)_f
z&q~LVq@}bcmHcB*%A`s<lYS}vOX>e9y(Vqao&PV=e=9vNZKO&nq^VR(tu&J)J(NzR
zJ?SIqf0F)3>Gj_-`M)In*V2C_Ju8i+u~bQ!R7fY%C(<qHu2f2`^p^DBN&mg{T2kSc
zrT<3yZ>7&mQ>l>}sYT`+zH1{rmOhsLjr9MJek!eF@&1|gymTauq*l6;u2_fzX(ZLs
zQu?v<-%I~edRwag_ZHjVrTkx({!8iqEWIktr9xVxe`BeU)>4BZ`la;WOaDXIpI4>-
ztMuPUFG&Tas*<KsDXpaq|CaQD^#79n@6u1Cb*%h<E!~w)q?`>il_o5qq)J*!Z%Y4C
z`XAHg5np-ZC%fV2?XP}W+yBX@Tp53w?fUB*FK(quS|GO^>$5@VBPo|kG;tp}ch@I#
zF_xy1q+BZaRlGta)lw-<_^l)r(nu<C5m!<xjif@#rH!<aYW`=&KR*4FJ@L1{#uEH`
zSL!!d_6!kRNvBdKO{8P#0PSBPg|)Oq(nY#ci-|ds?n*Oh#j?p<cUq$rH!I@K1#UeV
ztu;nJ=MJ?rktW<+zl!|-R>HsOyMEQne(jqlk<6r4T1Xj!m}7cGB$Le0g00c}%FGN^
z67N!4nhIwu!Dg^|TK=)IH7nV0rIbsxv|+t|J;`m`g1=AYfAUwfPx$NPBG04>yFbEI
zBxzqtnY1s>rMXnbZ8C)4umlTfA?4BxaTaWwu{ncld8W0E|2EPJB`?gAFIj@_?aG}m
zlk->k)*EL;!;N-*GUDwVHEel+bkiqYER18Q$Y9aiCtTd}-3zoYNhzsofyGHGc_8ik
z0wnqF;k(Dkt+j_r*8fu4n1Ff)sFfzto;1SItl1I`-;$R1NwH7%L@&|urp-;3AcViI
zp`RkcSoz#cLe^i%B;(zmk#sCQK<}sQen~m*xs+zorTt3ckxJA(lcwe_D_lY~uv`9a
z?qze{HlBYfRnjT9`4k6GY>CkS{V6K!o98AqZEmt8h3giQSAX;DmVb_YuCUb+a?YeZ
zX@(_A`B{kRTFA*<Rnl6@q#5!4N-CvFx+1J>rAuj*9xYU2BQ2z{G?B*ahz08s>0c|&
zrO^|NbY@!Y!jB8;PnCb(mtt$u{uC9??arCW|5H{z?8KCaXM_Gn+$dA{urb{iFQijx
zDdoTy$=WQXV`&c~pZ3DiUm^VKOq#R9L+k}IE2VwuP#QnU0c6rxn)4>5ZT%Jrt)&Yd
za(Hoxy6u5~Y-}f{Q0^Qvy5r(KX<wT0*I7R}SxFbt#m?ltAyQdM75IS&F4ty#TjI!k
zaPzq|X5Tc@Qd&t9Y0eE+(qrkdbVbH+{fjH<F>5tue^)qX;cHerPWyyVB8l)onzK+f
zo3udzE~EutG+exWUcT+yzzE+irK_Ft9|71*?fJ(@?nxJmoiQfl84(aQcK-<L!wg;D
zc|!QQL7B(>V4&f<CWfmGcVEu)mn?O<TSZPb>O1z`@*ktWrRjY7cY3s|C&)j=9a99c
zZNOKcBb)7uN8EXAkzIqTr8m0Vxp_hgd(th0pR7tF9g@~&NVKp^u7x*3ktNNEcvez@
z{-@`v(TFZ~6^hx~(~hxW6&q>6pWSGEU4#)M3oR}%K9?r{-J55IyOTw%LaDEA65)uQ
zH)h#0UMeZl()53K^fi?Z!5cC{{t>7_pvV!Pb^&OG<r!hdSFC>yqM7XexpamhyWUty
zx5I}=$~&>pwM8#`D?fAqrF2JnReD-Fln&T>A%<K!m0p*AE<GG9=196Fy(GONJuMwb
z8|l(Cu96-}zmR?|J@N(2xcEisbJ7FMRT4}Iz;YtJCjC_UWU%@zincagN$NfUh)HXB
zCS6G<($Z~6y8l#~@ty^dPK_EAEaR9JzP@v0Zpo~bma~;u&yi_=8Q{lG9lR~QD7`2>
zVBaozk+h{hk$x$?E<GNIa3VdBz97Ab4G%9hCX{|Gy(#@n`e<<RSh_F0Bt0eFk|b@2
z;{yqPB)u#BQu<`D@-20j6ISaOoGopA5g#oL4r=}8Q{-IZdIot&PGB$^1kpJ`%Uqi9
zM3MM4(wVg39yuA<ZXb}O@Hy!PX^%e|@x8gUl-`qG+m?SU-m}sz>6Ublz_+j@@?3gX
z`sud(N76m%v(k&wkyNr~D~ef-bS}Lv{cKzRg##Xee~E?2LFEf}(Mnpo19r0*)=wDI
z_{OlLJqSWrpNzj=b8!J+A;Kf^IyZ*@Bk7LxlJt^vAEQ~J|B@a_A4=~@Z%8LQ<^PiO
zqI8El7t#_~AO+WNOTUmF4lXXGBk2|C^U|$;$X;VkPNg@u7cYGxJ8O~o6_ETf!Sos%
zz9We!ahj=mD8iUazOJy{v5x)IvNsFV^IXdLiVd5pv|yx{-1ZzmAstGemHtHff^<td
zlrkPVw%#Yw_e^J6U%IbK-;us7-H~R1Su+Ev(#5Uxed!`DE~MMi7o@LBUz1+xtA47c
zR?;WZ_X$+a2Ya{%0!VgSmWyqqEA)Aed1>7Mm6S|!qM_u67Y1xjrMsZ2dy&Paiw~qD
z98CkxP$IL8_n)vOc9&jC>6Y|m=?|pOO3zEjfG3h>QY~FbpGbct{cGvRE`?IMC;h(k
zr_vWm;R7&851f+v?@5hxF}V1)^kwN=(u>q#BlZjQ{-N{>p5kiTh9)0yXz9nsg0&gd
zz+@-nOOR~%3<zjs30e!Vy|!v{4?>A_AScFNv+^UHX7UF!N=3WNzajr5!b3|O0vEfK
z-k1J@FRBM3{F?MN>5g<udR{7|b7_Juq_pxU+wxEP{}t&^q|ZsW$VStG1+=wd^X$-n
zf%~7~D@9nCeqJD*OETaNf>=wBrALq;THws&vBx~i6~?6X^ewk=Eyr9Zg&(mC3&Y|j
zH$iYI`utt#+tTky&!Dpv4s0oXBt4RTiBep;{L|unSNbw}Pz#UXF*koC{Q#vnA6)!d
z>08pbrF$sM9QmccJCnYL{-5uRAtKEVqnLKi3Ph*?LAmfSWQ^-wAkGp)6jYRwoI0ST
z>jGNfrz7(1hWK_$&>jz>0R-AvqFg$ZJ|q3U^iAor06_(gD}x~LSd#Q}>CG?$g>*~$
zn)J`5Z%g-akQ0`Ei4Ay0%88&7kaF#>J}>>L^s;ncdWwQ#0GKHhct$FuUr4VJocBJ3
zAqOos;EJ>{>G_l*O!wwd!D<15auTN7WUdqJR)sz3<<E)gaLAro+5Kl^^cnxy9RVcy
ze@^<2^mXZh^c2Q4g1eMZJul6rkEC})3aRq{8R^^7SEZMv35Kpkz0w9ck<O%NrJwu7
zhte0MKajpGJue*?m!>l4^I2(6dQ*B6-`MH>7&o{_?6P8u&VjR%KT5)yc@elGOskN8
zZHB%8e5m<JJ8%Fs`!4<86?SLA(}iyzvo!5Z^8dE<4e12}vKhHk<kEMfrSx;@osdE<
z9Y|l4{;Bl)(rsymI;{I}W(s<KDE%VjUtqfaSo)T98x(3TZKQqvZbE+g6X};b^}ofe
z9h%)Jq)RHp@e0K@1hJGZq>M6HgAG5GT4~RUxD}h~Is)BbfYxl45*s+db&OeveM4u4
z^52tQk^WHniu987fVeMs0B6!IX(`R5x1_hj{)7sDSNe|hqNThk*0=-_c_<aqd(t}t
z{r{r$ZRzVoPYJj<0=aDsj!EhF4*3U88F1v-%DN-g^-Ov!orVZy*iZ(JaR??MJZ@A%
z#=xB2vL|%GladV%WNcok+E)JVYbMfj(l@2wm!6jn@f|hqJt3?prDvrFY}F<uf3Hga
zMEX<dIVqE_*oHH_QKb2Y(lP(O-ki@$Uzffk9iuK$Briz8)22USdkwS55$N@P-_Xv;
zCXydMBm(aRA!+HB$W2JRwD2jJq$}wo>BLvBF|4~~a|utsk`DNyWWXN}tfz3U&m;dA
z**XnRQCVivN@v)KEMEQ&-ZKanYfIcl_6lPno9m0G(u>k}rEf|1r6baus91~$+>)t%
zMfz*$;kFw~DZ`<aB3E2I26-uLQ9CnGNhi{~(p}=1C4aicZWY86&EU?tl^j>3&?#X*
zM`1D+Y&76s#AhnKAbm^vy7a8H4+dVFF{!}z_oZV%{%-SpL;8~RjC7CRWyWzDu`aiz
zry;gpuly@)o@cpW$!+)WkYj9EZ}Hp*u5c7{!o;Lx3sA6qe8Xa({{^`F1Q)pix*n5q
zM-UqSUEjGvwrd!639X%gAT$=DPIsoi$qGCp{jT&o(o;s{ni*!X;oC2z=cN76!gTT1
zrEf`}kw$pDIml^2ZDt}}NKf;<b-MU@>ATW*L9T1+F>{De1)_N%JtOVM#l8e<=~TL4
z{|DyQk{soR#S8<OWMuXYwP<bIHx~Ug+r~o_;nP-Mq32^@iKGI;nQcKpDF}a2dR4k@
zId6@#&r#_d4cqgJ52epbzbicl-EM4=T4srKizn!4d<W92((g*INK@&8R6k?KG$`4+
z?;CnDC=BsT!EW-gccmjTh$|LtJTO2F7wkir2-Cl`%y~+dkoyMMP?x$O6r5UF>=GGe
ztn&^4I_c^Q((g#GfcylpG}*QTGP;({)QNB=JxAb^=maGWIKBK58?rYD!qUorR{A~Z
zvxt2H`nW{5gh`q2G(cjPhl*dK^l2xj_;SrxpLhh-_VcP_eI9|zrZ;+QWx<hUmfZ$u
zc=EtXH^dzW{LeEG#p&SrBMAJa^a9cBnvh@*ZfZgFu<t-d65(g0FG+hWn83Po3;%Oz
zE*(kx4k7MI&r6?`o|BG&hk~Y%mQbkOJ{J!^t<~FGi{3IS%a_u=bVrhOGAKjEgO#K#
zHMPfkeS4JKvvkTEHU3MsTD07>yhT()w>ehIq$BA|U=N27mI4keP3IHjdJ7wVz2{z#
zUI8{el|GRQ=_&B55gFjbc2OH|{xZg^v~`+t|2=G$mr{rNpI&JpouFZY6`uUgMmjMD
zOMqfM6#2-a=M{dl1`R1`g$YA;ZTY4~VN7M>V$1qpkw^Dvf}IWDlb)7t<6vi20ct_4
z61ye<*F0WeF5Qu);I1QHV`-MKKtp>$T)Ojf(lb19@CPPXpB(<pn$@1{G(bz9Gk9wa
zvDb`5EMTx6l|Q&072mZWj-6Qb;|x$|Z=1xMuAGtWM)OQzDPSVU-`UP!k`2F0;+0{d
zoazwIU*N;8A0fbtuS(Bg41y7pO0kcC=W`&KNi5z>x+C2IaH`lY2k7wH6z~kT+Km9{
z6)|!^4Gorf2{f=6Fxir2xg#AI`%|cK01P#cHgy}=VCcrk><l_afj!zFzvXu39ya`W
z=^1Gb(JsY0BPyGh?1dS5WOs2=;pef{3l@KB-5_gYb9YSO4C-)e4H7dm;I$dR1XbU`
zHVt24Y+BixbSBbM!~-Lkax1qXUYS|)-D`tzf>yrVNuQWwF>9)p6MM*#&?(<G{VxBP
zq*tYT?4Q=M#(3M>m=&E3pP|1Fc&y1c4oSbR*ygoc-?{XZbjOP2k*1dB3@aSV`A!y4
zO|m%S$v4L4FQp6Ved!zrK63j}!azw&FeMMjjL$iPf%AcM>Ud89I<t>wyTEv5z}EgQ
z3xEEuogmdXpGFcDdo^k7DUXzsPFInOWPI}}W$%4ndS$vEg}<H-g!UzdP)?p6VrZ=t
zyTDWobtpwA7t$kXAstCG=@_TAz=<Cq%#Ck=)|lo6in26jdWk=~WVcU!^$Od)r77Bi
zuPCgnGug?5&H=AP>uVHI#qt*BPkOyL$pIwh&}}1=g;0Y1m4gKKh)As&nz4nf2NW?!
z{Y11!;qisckoE}BxWv~rrhrS!g4?iFC4p0J2}+<C8J;NT{@o`4(8etGjH>zCH~~k}
z$zTJx3I%dj(JV6JtIv$vT7tzCj$$5A2C8|~DTZN1NV749ZMS*C`mUvO0_V(HMoJ?R
zjhuouc>b1WE0I}FS{60swUJ4yc!U!Cbc%X4bO6j*)ZBvL4uuHoH3AYAu}*0ugw~X2
zvlGRb5SFC<Q&}-CQjF4^VTXCQ2Afng(55=>p7HHdef$Jlmb>LUv>NS*uZwJQ1Sq%Q
zTb4WZr-c8N+v6mK9|7m3SX~1Uqs1g`P{hnYm<{`~#-*%$3$~DL8l)(5<3V1m@cV1O
zxN`t$0S~C3kI<eC&(Ipt^b$YV8L-lte;TU>6`0Nf!#+mfoehoXY$P3$>R(ZoT2lwE
zOo4`<+T2o&t=(8fKF2DZ1JzZo#v{V_C=;$Nhz(;FZPh~nFqXSUQG$7K$uexVS8wnw
zQ9BB;2ZuGHsS17UHgHbz8Ub$FD2X(hz}TpbL~miQjy<yS`4wv!dXtfm&7Z*H2%f>n
zC{QD-B&8r~WnJo>4R_~KS{`-D)+^0!)b70lAzYfr&PiOB^d{d13+X5nF8M&A3NW9U
zS((!KTRXt)oS*L7w#l*n3qZ}NFlDa%W9g3cKsu0C(y1iuv8dw(!E7_wz%IfdUX*5c
zy@pX6QrELSvBOKO$GP-S%A|Ydm~-C0SHuk;aKPrth@@mdSj1iD0}_S$H2)<5Ocaje
z1tSk}LO|ynCM@I}D*edvg%I|9pnMUE1l@DYa#q|k0;9l>hMy>-j4h97Koe#xbBjb;
zH{cs0#u^Ki^2NCk_tyyW7P<QMCy|REvu#J1*Vs|s{T}Mnhk(L~I#YcL9=n7NQdl*t
z9qgeIRwlA78KfMf08!2AQ4kVPdB-|wYqraVBFrHX{Eied)G!7Qw3O3J&>^vU%{CGr
z3J_(=8fr>wg9frbU#0cupI8bW;8hg#W|q)2+YYQnX%1*nt}z2SHe_R#C>n%1<+vbB
zTS8R5iyJ*beudfACd4)(G|73x3wG5KGj>h|erjzlLj#(cWFpM>wvz%a*zhcej-+N*
zXo*0hK>1ocNy%30wNch)Wm>}IFf?n@@dmp&@!jJFD6C{=UNv%xsCCA#dP!8v<fD;s
zk*=^R6PuNBvE9izCs-9(PuTO&)?|iy;EjI*UJ3opa2L60l<;sJirE@nx5f;wFvB@f
zYfU6j`pZWJGq`k-evDD4#vZEK&fr3XvXo^H{!(GVXa$Tt+`kj)zVs{%2G3(F+P*SY
zVd*tU_4})9!p#|Q;=<V5wZ(};E0pnnGi-H?f{4;biAMKut=3dHtkpSa<1y_Em+X-P
zCZ^=BKND1CVh~gbC>L<#66tix*NI`x5@&r{?-GYv8jEB)c!8)cM2r(;Oj(IG^5355
zCg@~=Hw@BV#g&1I?nvP?!ml7-G+2@dC0o96ZbiIq8CTLLfJb98n6S02W#LmdpkmV=
zu=R7I$%Zs%hCK^z<&Gluh&7F#mDodTlS?KY4{8{ZUZ)r%k^bi9k}fQoikPT}!sWI?
z1-HGB9!q=DmGnN<(EO%R3YFDY)`aX4EM@qB1d=}<6cI~YSG;OXPBR4^Spx?y4HxKq
zKt!;?B3fI^#*$reWBQe&Hnq{grWCPT3nMCQV;4PD2Jz>HJf_baBfF7KrH`dkSK$pg
z@?M`pw!r%*-2JYx3U}adBhrkKdBV`P;28$M)ES@*{eR3;2oB^JbCDiuOnti7%N36l
zptJhfNJZ9q+0SOXPisCj>LyLUf-VcA8rG<Tuo8B$XY`SV80N9G0a@4}+O7aHhNQM&
zNxdmCH$&E1ciavKkfTT=BTtTqx@-1WiPTybVMHk-%21WD4jL2l(p!l#$aksn=bXnr
zmllXKvlgj(fS_~^Ah60YG`f)9g^|Rf#L^35a-7`+TQ{YaJ>$Cy+bV0%-x5MG!>^}!
zH)S#5fs<BVKuX%NhjVI=W0Ojte}%p3WV;nWM&=B0<d~k;a{MWDoX+~X-loLoQu>Ip
zOu!*Qe%S2X!-8&iYMiU^#8ZNZm6e-1B8hVWWZ=zMI+kW6m^qKrv*AG?%=n8rOVwJ{
zIKceoCIdPF1~0KS0hP7r#Xi2Jb~Vh25yv*pL|C=pe`H+Kqj05vok<__6$QJxq40en
zT^hi*!+@6b2_E7@#NP!6HAks8Rx7-I{?fwKh<n0IFHFY*zq`Ielq@6nS+W;JzPW&y
zn%(4p;}><`RtcS}thUg)0}z*`tsV@eNMEC+HYEqC4qE%2GfaOp4y6*qv0)~OqS4&M
zBCE}5<aq2o<h+4JCZ3}97$6GNSLj@Bf%n8vhvi_DLcG`pn-Tsg>yLC?Tf=R;9e`$Z
zh%FKO!~(OL=&dA%UHV2!KMJ1Q6gtp^znB<ol$b`VD4<nn%aRbK#fq+=jg=;&mEXDa
zcbFK1HL#Q*1dsaM$kro>;HZyGdeOhqvbGUTZw7tvB03up^~-9<d!XVAB9I*rN@Xb7
z#<oO2$^{ly+ok9rw}!e_R<hgRgx9D<4}y^4o&}HRRfQS?YkHg=MWoi4PjlR8MObo4
zq3S@|>-*5_ivu%Q(exZY&yZzs+E;@tJPOpZ+Led|3ZT6y8}o9Ie+#?24;d{dItvtJ
z4GDd^y?Buwj97?<G%qvjJF`T-M+0q9)e{~hxxpFUD+42oLg|i1q)>!0x2g!@Rxle1
zOl9jfAY*&9yluS5lucJqO59P$&CCJC$bc5Bkb$MO0AwS7c~>Dvd<)MJS)Gc%91ldO
z{t}cnJZK7{wjr`D1`a6bX*t{e(&li~mX^tY#2wGSp~X3VJ?a~xbELb%s%J6{e~b?g
zI!-V-SIlVN*wS8qYBI%JqyoMu4cXik9_G@$$3O+oZ~zmVV-Xlr(BotOWwB2q#E76P
z46dx8pH_6`1=z+SgG;R7ntPsEHD88e_G<v~#tr5s2oDhuL<BpCwXd~jY^0??^&9lQ
zVBfiTuSJ*04_D005Ey!6RktYrek?uOIms+iwdlXdu|Y}ESL8*}B-{AK(aActJuKYn
zgoJ2#aV%hMRF@!%PR-GT@ODI?u^9WLLQQ{;XxiBOrry0!JYfqDtvxJfi$rF(qMhfG
z^sURl_PI$*xh6B!9aBKKd;)|LA!7sDC5%=eFB5vqdO8^R-OLz0=Wqg+v}EVRG?f#P
zg1#`dUg_L&G8ve9IUrFFK84xf#%5Ov4rS8I1*vG~2BR``3E<|%=7RN)FscV52LL=~
zmuIYV#SR?-P3?a2P{g27FNjHVi~U=4)Cr^4pTq(@fQJymi@<Y@{y#&nP;c=wl;;%g
zdlbQ^#-o3DQw>9K1QJ{8&;}_do3U?v(z!F4qk1sK>9!VVrUiQ>oe@LL4IH@MVX{`o
z0PtI*JqDFwanmHr3?O*V;M59@uB|LQrD4A7<H-qef<X|CY=Pe`c+z1nS!URFO0)Ke
zVP-g{WMD@F4zC6<X5#kTv4OGC#yPeSBkCB#2&iIA#F$*VGTe5?zHbH&pfdK;lC>Ve
z=MvPbE2EF^0N}<{s%uJC(S_2GLOw({&peNay6gmar^1Gvk<_JzfGgsFo(+!_IEu(q
zc5GC=F4_0_P8wfE$TFhkecznPu@yH*hUE8P{0$VxJN%*qZTYz0QmR26I_cpR!?mLK
z;Q}8bLn<onH~iFal%JLEQu19&C+L!+k3A%^HU87c?B2`_W0a1TL}MFw07uf((p?^J
zWms#Cc{o6v<<93X!0aXjUL%{NADHKoaSI)c9GtNgZ*^sax%3p9J`H$0$N&Ox%WP0v
z#yrI(su}&%J5aa)cE%QmufZiZy!IGV+oKT%W)wXBDJ9DVh4U$H@Nm$Fkm9Q=rhWwf
zYhr0-gh63IcgH}jA(T<FT^qeM`QSTFglUo4j41&d@|7Hc1v_m<Hru5?B`L=WFD>?b
zZM>#%0L}LP6<pg}RJm&7p%g4opwp9q76#{O%}-ONeak}6heR?fqK{5yNth9+%|N>{
zM6l>r&$`bL29H*mhCg)PSOU3D?VyM@7T}y_<C!@w=a5!LLhUt@ly*&JDQII#bH$52
zWv$A+PlkQC1Vvclz#0PGIjed(fB`FBJGYr1;c0{6mZ~h?SJ72aYwNckktQUQmP!J-
zbRSjOT|tUscjP1mk#TL)jn{)LG=qY5K*&Df=Nr~(Mwh{An*x()KPS>iI+q%Tku_v}
zOT6lC@FA4P$$2eHJ+UbrCBNDXO8FI8PfgZ%zy>-XrYp#+BQv_z{u;RWj=>SJKcc^5
zjF=8@(jSNQBOwkhTJ*y0^A+7E2=0=Oc!d$)D5IG~E?@g?gE$^uB61ujB}GTtFam82
z174C7Z4B|3&BbZA$)tf6uHb<jOB3lF3fVoZS7{9qoty&JKE%;Z>_oSPC8RfUhzlP3
zh}~9Tp`t-Fws~+7dm_lKdJ$!qC@&RgVRR{U_9t2Sa~>g@2W1gzv~rAZVgZF<HrBr?
z2;gV9@z!GK?n*(#n(+#2mZM~K!-w~3kij*+fIl@tVFE=DAna<8K6GM;EM#N>=7<zH
z?(>Lq9}3r=HIedNn=}yJR(s6KtWdq)+m{B{ZEX5hfyKy}V9-)T+QA}70Ngn^L1snF
z4XSr&m4L#7klaxI+Eyif?=7H}_(_j~61hugUzov_^*5B<X1vpkh9xMCq!fQw5mq+R
zg>>c-s95DWzpjn^G?MNh(SRU&0Q!c~dci7`#{5_sL82s9*}-C(O82E_uwH3-_pk>-
zx{iYNtWS*_h#V<uEfpf1Kx*0>Kqw2F*cElqfG}i<&yAm&$KF_Wn_IlMHc+qkU9I(-
zjOm|?3Zn4pl?8;kUtC+Qr^Hc-xF>l0DHK1WwzeY!SW1ti19~}yETUzh!VQcET(Ah)
z$K<$Ev^{u98&sgT@~t`TsB+1ox0e6eY!xPkpJ1@dHj)f{uCd1#q`QrqkOJ4A?tBHM
zbC15TD9dKsHNarMM*C8TX2q>WAm2Mafg@Dbts1Q21|6OAm^~(WG<dW~ZXZlw%<Ak}
z;yfB`;T2Qv_Hi*qzp=R{MG1n_^{0yl3&7x8p!ZIKPjTcU={=UZ7eqA{0mZs!6u+ko
zIa+Mhio5b<ZT_mJ_3{XF5iQmeOIdrLK*DmoW^SeRTd*fbfDOa)_n{rx5Cz%7fUp#n
zE~Lj6*<CB-m9-qW^@-X~gmp`!$__Vh6gq-K7R}hP0aonc9X2>VD{So<N?Y|k)YR86
z-Rf9+z^YV8;fluOC@jpj`BNdnyKof)jLuNQrFAtI<_5bbj~ySfZ4qH(QCDN`wYzx(
zJQqEC5mps8!luT1u6OEBD6+@M8IL{2`akB_zn=Asy&7NaeDyF7B{bx50N7E~KgVvD
zAmM8Ungn$)yQu;k@K|M=DgvkyrHn1mv;HL!$B2nQEqvk%R$)zTs~&h#uSNyznwc)=
zxVMQ(diaTL(*jgyL_9(NUlG8q2bF|)`2-<{sw{EZN8}-mwbfk17s1L3X%&XZrNFWU
zSijnt#Dp8KjZd5k?@KmV@M}r~7ltNij)w0^hkW%C?n+3Z2cb2b5w&L-z?elYjhh^R
zLT5aNMc@r4Ch+{3t>lVKeQfc~^=nNi!UX`@U^E17*{~kHMvVC4InJS_d12Off@@Q+
zp&I2(oEN0I;M0`^K_xOQw<Ca9fQs`r?y_7_wVX=FfN?z*LEvjOCO)?Y&UnEq4%r)8
zpVqv4YfRttlju;+5%zdzC^lk5R!uek3=I$OQ`;<*q2%@v&jtOI0$V-chS_%V?J88-
zHWCSE)c*JS+aM1c*jTU^e1R2uUs&5qOSK|uV%K&{*7=ed1tr6hjNx$hVy}T26D*u7
zAmN;qFRUss9r(bKB4X4LlX_pm+-c2E?6BT7!CFhnt|Vm3%>iW20N?(W0=qF{lLj3t
z$R9H^DP1Rd5HunK5BxS`700O1)qsc>AW6_sW`;i`W0;cj`A!O%P|`~#)>Sq~Ht;LA
zXH$>iTIO(`@plyvRA4C{OkE8Q^{Nri#Cq0ROkD-^KIe7Xox+cBX$9%{9_eoi#*WzC
z^)`{PWwq!1dT(@pU*dzNrWBn63!YEZ+QP1`SWdAEddgc4-r+*}gcxPcGh7(O{3(*9
z(#>#7&H9jb$C|a8QwI(@&w9H8yRk~^2+>DhMX<tZ12T4hqVm7bb~_}#$?%MqR^018
zIX*4?ctKvAu|h#D=@roeoSs<LFtM1bz$<Jxh-e23qJexgCq~b(N^3HpP}O=H1!!^q
zQ%lHh8AESMOtc!n$T<qNAm6I-3KP(|#B5#-Ovm;HUh}{aN-ZsjkA4@=6|Y}D{b*!c
zL<BbzoMDNMcGf>L@4ZKwRN^RB&^|`Ek%K`6aPSBl<BE<gS&>0QGxkF5He^Xq>H%J0
z4jYie4aTr5Vizp$K5uwsMd*obu=)v&U=x}(f_yaNz1DyabE5HH1Q`sFQ|6@xVxN=5
z3))S*NGHO=%C9x`;*`YK?1Bl|_{mK^AmpE0Bsymambl>zHR;t+ozxJiL}~FrNf%qo
z^*t4+F+Po@zZ;V6Tf|@&PdE%WXNO<W62HJp)#$|r{5*DDcYS1Mg@r^pSh|cPI^90Z
zA*0DMdP!I8f=J?~gm-ew-8q0Hz&)5?rBM)C>rX$HM$*H90BSbv%-YcQu!m6xo|viW
zRba0mOV3!D08J<T9^%WLeE`C^T`-kvZ0o+ZlR+}#yIvzn#i0IM{ejb!m0U7J&<kIa
z=x=idV)PpXY}58Dm^%s>>Sq8`1F|9}oI%6%F0vjAD2ip#5U}REQsH_ZOa5qJMp|xi
ziLaa)hC4EP=MrZ!G@z}i{)tWHpBNrm(+E@{*PilKKz$qOvElil#u-GEGj{+Heil|N
zjv7*p@<u>9*$H8V7OY^&7K2EO$Mkfg*GeJww<v$sMjfq)aDrvg6JBBC+G>u^DX^rc
zR@JF2t?n71;BsxSWEt7anj~Vx3s!?9G60tv8u-jKVxI>Ln%=O3U`}dL6E_9^72M1@
zWJK)@qJznJEXkxbw@%o&?N9}yb~`rjdrs&y;TgqniK)ks8OxI+f^&XSSQAF*&khz@
za<{i|Q4`yRb0c<Fn7v*rLdKgHSf#lY11dK1$Tm+0L8o1DOS&W7hQheC+_fM8*u$Ch
z8d19}DiE6JU&slzk_|Y4O1q;760m%0?@(Iv$HqXBd_Y4KX>as@1xON)>!{*t+XF`)
z5p5@7;?w8Cj@b#ua94&%MU;=Ed(<5w-ii`_1-;H$Z5@C*wl`dxOI&kG=gex&JBoM_
zL|4q!uB}p<nKNDvjuVOpi7LT4<=?>LWrr*b(Z_;aGUGmh-vo=x`<X{OeOS%Nh)X)c
zM)vRpMtW>%YL5mQdErueQ(8;+r28brQR0o(pxP;D7dG@S(7N3ENTL+Hxe0=3Em&V$
z4`%GR=&6tLKu`GNey_ax+5U)A8Ct_HIu#1sxa6@XEZ!rQ^A<Y08v?CB7$2KD3XLea
zfp)T33h2g%L_8yk&?D-osRDflp5Fs9YeL=;Mz=xdXZY}jMD3P^@^TSO$v4NGBjwR>
zpx?EzC9f|Y0nv{EqHE&?jqw;MFgzOs0O7Hs^K$`YvDZIIWJCtJ!vO^Eb!xezgB3~m
zOPg_B1O!@JcC9{0v%vvm<Wx~1=>pJ0qR)+y?u9NjEO0oG&iM8qpbddzlq?peUWHB3
z3EEz-6j34Q+`f2X(QIZWp|^l*!W^%~2WhNEw{p^XZSP_4#VJAJZjqXvNEc+m&9)5-
z3Y3sS!A`i6K9ox7fixop>|o?@ojE%;Xu5Nw8U=cv-E=05h;G+J5YfmrBdr+o?Zal?
zq*<TPLH?0cN%w3PQN|56C_|6QCu13|`(wP;ptrZ!m^;GfmZHN$+(l$SQ)?}o5LvE0
zkq);aOC@AY*D0F2U~kO*JW1EThKV*uq{D@o@!SYD!wfPrdQzl@4O?JA5SxLj21&Pj
z|G>No^G^X?Pi$+91{Wu&^jvyG_*=0AQFmI|u_U`Q$c8-jLzt8~vC1A_+E^D_HE89D
z>dTl{olEx&d1;99&%6$wned7}TP->}Dmr{GB^gpN{8UIgqKu}W!H8qtR)Z~+Tl>J8
zmhe4e-g!b^cmNv87g6fWTF^4==?V+q+_ZlK<_PPb8*CGMp|vBTx&U+o2$CSUX!$51
z_Rj2!^1<T8x2&wyai8e{QG@UZzo&e?PwpIdYW@i{JOKZxx8tamgQ(X1i9o?WSfSPV
z!1`u*mP4FN!snwgE*K##E}<EizIgjHTCtB&;~D9cv;2oL$jEb#Y@+RoazhOe6)^hP
zIgBE3Uz&rhty`xw^gexx?FcTjI$|vJ1Zf9BrG*B2YkQpXne-M#z$Jeg^yv)?nUiKu
zy#gEL+znbX#?e<c2ufJBW~cmD5H6w*L{<ig)~v#y{+<e@3+B>&SfwG3mT8E$L6^D|
zKIe}b(7qXB3+_Q>L&kP|JX6Ve;i(NLxMx7bnsx21Vl-0jVP7{?&>JH}3?YOF|5xnW
zfET0Zai4<d^d<)^!vcatzs8du8pAor%snnouvi@7Yi+Pzg(oQqqedRSHk8{Mi+Xdk
zU;+6mTGDs(ao5tv(u8)ZiABas6#2rYCbura2~U-HdSerWCqWw?tpm`LPdFz18MC)8
z2~5{$M6;dy#J8*|R0Q>RWFbY^tPz+(ua~xB{ZA~}3`%rp?w%FXdQBYh^<(fuS^G<%
zCIuqs>}RJ;fmw`f-er(SO0;@yC8ENE$cA39J4Q5<vC`JYl$Z{j&>_bV)E4^|7By{%
zT{Cyzz5wY*Qv#f5`v@A@Z97t+v>f?b#ZO>6<b=!>ar_GgYK<)DxxRQIeI$KM18g*$
zl{|k-aIml)Hr%}v=?<sK)@JTAt`97%G|laHZ&b~je$k<@QEy{wx+o%r3=<s+8PIuV
zYjH&n?0R6j!vQRrK6hZQvqdX~g~|G}2CgqYkseF6ZNpHw73NJO;++|7>5}|JdI9Yp
zTR^<DQsk4YkL!QK*RL%eNl!axdpEuW0pSGT&>C@|VT#n)u)t1!1wQ$XH4%gX+9SJf
z5teomi7KMBXQ|#@8fCn&mv4xvI~$nN$4}7QNHAjyr?0NaeLEU*L)P(FdXdB<C#2cf
z@s@|uMtYw!v4^V=PVX^Bq2W~{m0JPCj@+5IChdl<pWs4eB(XCbQ*ZxfhN1*Y9>#A)
z>GDY0x0fF_5;k;uo&vx(Hvh=m1Si<2-1qN_KB{A2r{L}8hM*KyD{tb?SIp;Y@sj&S
zT&awHA%c_+KQ3K{!B;Ja`jgQ+Hc({(L1^Y?Yy^!kitHQPYhzQx0M>g|>=gd4IqPhK
zN{nnXE%3}c;<%RN-<#AjjsR+iObhEm>w#A(e12bg0Y@{k`8!9XgG(ZiP}pm~_qOzw
zbYFVPnAWYW-hwUtXs7nK7A97Z@mlC0YqL<I+qoX(QK=Mt3%16Rz$Y{7nd6l<+xdTM
zt)q!_7VT&a_k3)nn&I;=rH`aX(yQF9;vqexSmFsf9(TIo2h52%v=PP&Y)CXYPOX`-
zTfCo1KayUQUchfOfbpjg60W%aZWhhdPNcfP03<%_p4Df^wnusyKtaRfj2A2M#Fh0o
zWahx+^50`{Yb=K-|L;@WTirAcD1l^eO7BXakusplJ$7zO9=^gi_N>>b^qTZ0r|9L#
z=@ya;>+ukP^ID4O;-5+1m+o^cwTMa2rH|-o$f3tx?mST|qY>;LZg7pAn7|EvXq<wc
z0p8dlOdG=gxh1QC5iPd&e~vNA4Z=OOu7$ZJUhPi%ldQmN(odv2($j>_XNaRVr#ZBL
z4b`(p(ko>5g>~%&N3OMAhwkE2>6g-*(zBT0oQp%3MM8Qu;DDDQ{oI~uV*@+(0JE=%
zV`6!G1|+rOA6s>`F@i>E<e!cMCnALz7B4JQ)GI5C!z%7HEMg^F`I__}r3V<`%tne$
zS%4a=@G#UcUHpdh*U~-dMKYD>pbk#{npmsTp}6y35_Ocu8Av<+O8QvZm)6ol4mTb~
z&`Xl!mMceW+GsIo;N#x*fd&~J9``vZVkUip8K3m4R*kijwITlzm9B!<YpiO#K~)OF
zGP3RGe-rudOF4xzPxjWtcPV0DxCq~oek8ptiG*9Y{w0;=q*|xa$AgRCkp4#cJVA78
z?H|WRjGI2;M5?h(4JpXrQta1KvAB_aEL0`BV+wZShC64(Vaq`P5aDZWT)6PnZNFqV
z-dTiCKJ@3(Ur5h_umy+t6m)6KIOJ1Twnn;;ej)vh^g#NIjiQ|xBPo|26YGCOY;rA|
zUrK)_-I2z$&#ch1$V{UN<9sKIl0-6xPA8-o$YCxWNcW_Qi6y=D%aG-UgLn5?(cCI@
z8#^lE`hqnlT^~_pJEyTW8vI%$I<`oxw}cDnHR;c!+o0tMk3TkQP=R|n<7m82g?}Oa
zxpXZ34%;g?*r-K|7dB8wA^%@U|5iGXzQ}3Z6*nwME`o}&a1MCq07e8@f#=r5E=vOH
zL!N3i7>Jpnw|j&SVd87d*f~(=VgMrtF~#xFl+nzn=|ph6+HOM*<MN*L@94g}iz^+o
zG1sWh+tQDv*IkV^(tFbXDvhK+kzO`P`qBnt)qq9sQLeeZ^ZU}D8x8Y5ZXikSg3P2Q
z$KA03g5w|R5(5!L)RaN6IlvfqiQLrEtcgvlsR@flxYzY|7A&KnY;YP4DsX}I3p2K(
zg1wPGkp2vd{{p@x!?$eM5|`53(#IYLCTt*Gd?7t6-7?U<rlH`W^icYV^h-h+iO}$G
zq-Ugk=@rO~dj=aNF#KHlsr1Wj10o)8VO7(HiphfUd}9il0WEb|{F=~sMowF@TY@?q
z9IdOtFs}&KqRg9<dy5|00$$48!$;Tu52QbrZb=12iwIp#r4|$?GS)NK{|}_Uf(7>g
zU%iJ?&w0TU={4lvg#162{s#Q#^OT`78?kvo8Q^v4ZP2Nn0c}wGsU@DWc&g;krAwBx
z`xG{3s)D~%8IT;sjC|)L#pI%vID<=U#tc?@lr;ASJJ0p!Bk8Xhr1dmVdE^T<dokr|
zZ@Lz)rFW%&gNu5B=u7r!se|cb>2>K1zqpb<kp5S~fG<(^Tu~L78gp_%64&$`K-i!S
z`?iIIn-Og7*>Jwb4Ty~Ox+4Ev7}j61X(rnw=GKtK(gL4L0;g0#I)W+jAu!R7I!tnf
zx%5zaNqPVb9(ZCwruHN07nU7etLV>h<zJ@jJEMFUWyMSBd(zK*0U`)ZTK2K@jFGMy
zP9RRp@MGy6g7cvckBn6sgoW@Mv0&vugc;lAj2_YwoGuXqBX+H}$wMtJ_S6O*1y3Y{
zT98=;wcfb_A{9!h{ugn;BTFmN8oe+5RC>+TFcvQ<&nxg31eIOnfmhPcrC+)TBmMX{
zQX_pHD#4ubc4v71N79d^A8tE<)*3KJkS`ufGDgc76mS@Xi4bSOPg9sRWs?+^GS$3A
zmjp)<K4>53#@1O{Di@TM-9(@iivMeAEj<S_eFk!wp#mRDKahUv`rmNzzmY2Qq3?o=
zr4E9Rq<5sBNUsg_e<{5q{h73s-jJTNKAfPGFQoU_JXbqo2oYb@R%HoVzo<aB;Q6`V
z|FLudabRKrT+aVpSVB@dkG`-OtxH%mcfg{evmm?anf)o=lK!o<mY$LhrTfwm`?_KA
z-;@4>^mCU&TD;fU+b{8|r5(f3N{^(sq}Qb%59Gg)-T-5H2OMT>%_Ucaekt_5*x>*&
zW6LBw@`~}+QC6(Uk$P;>7VnyXm8igZ|3p+)+rca#Sy4B$F%c_bq6GlQ#D<#guHz*e
z_+#l~(94pw4q9-sb|=z9<K<jmd@lV=dMy0~22jRpXt9SMOCK2T{Q8el7JVlDM`#-X
z+eATaBR!Hn*v3G8XV(^^=Y%YK*pP{_&3a{&4M#&q+FsHSALWAdB4=)ZT6Xpu3{^fL
zh;7*`!TYVPys=yU$&H>$e@U>E6W><6-Kq2mfd6puekz?n2ne}Eh`b>({bc9j{aAV|
z{ab@TWRp{BCW4>toTeG;)S{=Y;r{#JTD4i2&JD(MM#_8!V0Q(yHZiJb>zk+M7=a^8
z1w`}S81h{d!|EsN{|}{y*qxcRa0XBDl8WyR`Ts!rgwq!Sw~j5epp5nLK>o=FrJ(kQ
zFou11r%38l`tS+odB_^Y3rhmRb!EJNZ5LkO{4vq%+_rjS#F5NeIl5M&)&{JmN0^d9
zA6O{eQoUI3ECR=?CLrz+>E}KfRAkwyF!I4oB799+Fi|>a&_QR5GTiC5{$ELNN|(|P
zaUJgHYVgNTr1yN~I|q=%yr_ZON~@gLu&>5k*7ybpxc`#azXYy*L>v;7#(dj)RYdwF
z)#VA_JEaIzO82G5J_-CL8?=z#XFUhLI}~U~FIEFw_OAJ-wvRBjJ4v`<gA$mX+XdRX
zd3DnUmKC99Bow(>;uH-(l0F2NxxS{cLxP;9oZRr9;fd+F9HYB__?FEuG7@|6`GVQJ
zH=uCcB>#7}igm4R;OZv%pGynrZCkv7bebnD-Z|S~aNBn4;@zLjHfnHyOFI$G3v#{7
zN*Kq1yN0<A3eK8;PX{#QD0A1a{El!5nRFXcM-3nHYOr|zsev$pKa$z|xAt~BL>L#p
zX|E>RLLtK4`hP_*QEu&-xIwQCbs)vMe>Q2jyPRikNi-ibd;R*0at!Sye2c^@n;LVb
zu>i1F09;Csq!Z~bA@kY@oGa;+Fev5WBirCR^#2@>pPLnNbMPenzmQ(HoimVP^Mq+2
zEjiYQJ13p=SH819y+*L2e~MCN)87jEUz)y2y#CV-IMJB2ZEV5jp+OK8i&Vz~R5nE=
z;JzcQT5XfC=F(jp!0rW*ndt21KhP5{-dHp2_Ub+9`evJ9@hjfBwf_dg9$Etx5)k!R
z`e=Y6C4}IfbVtgmeJA<du`cu4cokg=Je8i7Ug0R2Q#-zN-<a`2L8ia1{M{ITx*#@N
zm;I`X*Ra>KoB!rX*B1s_3kMvux7>&n6}U)ur$M|`;kW1ZjX5Rykw-t`@AfEvr)==C
zp~q1@ORi$`_o9WriU>bt^Hlh{U+-cMt8!~~N)~r!1}Ey^T^NuN<rIkLfOvOpl-6Lv
z27GaCe`1GaHc7GWCub4Bu5N-rx8LPA5n(gva{kTa|7%>|%K%E$Is(Edb{*S*i{w7K
zQc6a3>R7sGC`MxgG9rkKX%4kNQ>t#>wWAEC=x_wC5i&JTh@gK}S^ZiU|9W5a8{Re$
zj;db-%T4v8k}>qb8EI?OD#onn*vc~-7WgQdF*61^KP}zDKg)c@*rsBQw<4QgNf5uG
zM1J%2`%SKlI%Em#H%3}vu4|I#UN4>SwHF|>D=b;qmI_cR5S3mpQej>8DaQ`vCvMPM
zP5JLrcz*}i|0WkVNGNE?SJFc%hh3E0#F7L2<h2NkexNc3Ih{beD%ezWJKy@+c#WxV
z#&-qpIXc!OExEjj3H^5|!vBPe*TYhRrB=fkIY&Yn{D-Kb_kze79MA*k_X`^=b^#f)
zfW{al`)i5Mpts~kFRBd-3P`ZNX`Jsr^p8VTQ2>bKwSb!!)}ps^xy7DYpRSp+atp&h
z_Pvw1HvOJZaR~H#MC>{1hho2NpT?$v2KzU%{Qw09onDfq=Zzf~8aU4qP+*V0ximPV
zqv6U3s4JRH8sdSL`t<asBJe--Pu4GaCM@L>%b;4?Ip+f^gcr~bpm60jlR*|S1uoaS
zxUeA>b0+aGC`KIg&rsM6LH+~%bY>*c{T?$FzaWmE1L0iZ4<>lPt^yVvk1GP4Ln75v
z=`l#^*jk}GCmJ?oW6^y<E|UyfLq>SzB;MbePiX%hZm`7S*JyHq>ze`1bT%Zn3i6mr
zeQQ}$ODFaW2;o|X8<?$UFt}@i;K$OMYGObG)o-8h{Zli1X&>QL2A+j)J)wx*+3=WV
zQdkkcrZ1_$^e@5Ks#y7njTmbYa$)27H>~^^OZHp%iJ)hWQWlJ}IhHDrHzA|mIH24d
zLIdf!rVk}pfNL0SK5eaYAmLd@oB%Ee9VJxxV>=t;x8c)?p@FNJ6Lm*h_sFKnZKxHU
zIgw-y-EPiUzSPK5af()BHRWLiIm!T6Od^Z*3Gzq|pYlUA;kU#C`$|mhyodBDaK;T{
zYPM$?iA-r~RIN1FE`FHsqZCbjEPdh}w3ua7!5B>BhjW{Yd_*cy{I-UlW32ZT<O|sx
zlo%wFO!^7>-&=x_aTiK#cm=Lg5yU<k^l*C<NBYcYTv-|}wWNz`^V_HVk*uy6K#*~k
z-LZ5G13#y|x5vR%zeXHsat#4{#@AI?|GDGW1y*SaW4#8}5Z_o(+YJ2Zx8&2S0nC7B
zHdd1iy`3^0;}XELW1-YW(@xuBYT@fKLv<p!9rn>j$Lq?vN7Juk7E5FapPqjeW~c^4
zoVefaZwXBt0Nq~o`Sgpk{_C{{80;?L#|0H^g(c1~rsz~?Xd*InvzUVMnbQI@hshb0
zfCD?qW{s=8_TCMDTF`KkF1|&Wl|#*14(3?y{J0c+37pbbKY<G}a4`e=b$gQtx8?6d
zuGZdk_r)c1AWk5k1qyd*W3)<Z3d**>LRQ4Dz=EQCK8U$H8j*tP63E+#lK-6E*hlQS
zPYFMB6NK!70Uhc|@*nV529j?+b@TXw#kVw1Uc4ZhjtI8*>}-lD$`^AQd(#v){IoDG
zQDdXoV;Dd&h(1LCFrm|9W5<BR1Lsg`7dwv>^-uI|w_CQLd9oB_^`QW)a{G7t<csru
zeaR7J?WEB4SU|G5sF@9?3pi)QKgNTN*_cOOlHtZejo?muA9YWMLnjl?Gy!*Qp_jJc
z7q_15fm0pmwsZ$xK;7?uDrtuNWV;_R=V+s2dLW~j^U5aWoNlvl;?B3Mls7gYZN-~k
zaGqg1c>gsMVH1t^QaXkGTF_uO0;C_iR<(AXUTqQVh~;Q#ifZ5sO}v7ZWH2_0Z;DqB
zIZ{$aN6MbSfpZb=gJ|V;zHr77Cy#s-gkO9>^Ng%;+E5P?16F#?jB)WC`NuTR+$NW=
z$Wl*tc4Ef_jGR9$AU-@aEa48%c)VSPY3<DSnjEXNQgHmc7|4GE!Cu&)IpEKdguv6{
z8&SAm0~|tWl=T=^Y=;w{<}eW9U7A@7GwC(Yc!6r}SiH17Z_zT5(VEl}O{YeclkGdV
zylP`3aK<*2z5<$?^YT3^NX}o!jF^N)-|7z~3)9;ZVcmzcCoE)ZSzkE)psL<zfG5(i
zbeCrO5msab%5nxSJ``aTQ;#q{YeMG<RwT8U?7ny`-IDHs*>8+`oP@fNK9)Y*(Fq(*
zJ>#btyYGOEY>hFWa<p&}7l)M|^^g3`usGq<D(>DDVInCg>6}>;1iP)NKw|SJNPkay
zUiw|>cck0A_sS-pq`s)vq(75>X$Kx$U;MK4`_gBmBi?h0+%8do*QNhWdd=E%ueZ+2
z(pMSDbjaY0OFHq=0=^;r2kEWtE|41QpB`z=*4uD>Lo@YkAccm3j2G0%YS_?Y$i)Q!
zH$lk0@ogg09z;%+*>OY#;7^9>3sP2OgUcsSAd&*VEPY*iNjhLM>;e&H4A%R;^zWp1
zTnf3gFMUq>Bk2`PM`Pr(Fa~LA%Im&(>8)Osz9GFJ9Wxa#*psQk^{(_2>4$?}n9%>U
z>Gv@pjXhpc@O_#C=9IKL7D|b8+M`@k7$JVd|6C0k85=7LoTFrO=HoW~Q<`&{26jiF
z>B{G%FG?@Km~NSKm)UusKal=Rdefzl^#2v<52equdBl^a{=4_352bgd9}Ps9E`Cw^
zs`LfM5oHX&TGQJ1zVs96F9&xHGh7?^wlt$zpbl#m<$N&HGBeCAN@7zeZ0e6^9DX8z
zdd$XZ2%svn;i6l7^91`dlb)BpB7IRhmTD>n0$_Y3y)FGz`fKTJmqJ>+m!#j9K8va)
zy?8)p$hq`LdR_V}>8-)VGwB)WYtrvZ&ojZ~%Dm55I+gxf`d{35b+dp{_<D$CPptpj
zyg<d0%vtSyXEWzm?*kSivnKu;yOANo+JnfEjYl1ugB{r%hngPX>?RgOlEDkocckx1
zpJA`2Cprh`zAa6qpG#-b#}OPPi}#H5E$P3IUcq(UW(w>_JoZYuCw(M+6xHZ-@$=Fj
zN#B+pFcxwGnQUTnNK?1!!+{!Q1Q=^OQXoRfD3T;ge%nqa?O^<cjhar-XA0#~C_1&X
z2^&uzGBBSEkYa--Ps#meghC7H5yjyh@=qfCL+PJMFXK{5^yeYaN~&glAe}@_IWGRD
z^nYL$(<Kp1Nr;!;^hY!k_ZIIt>5rs;CcQ|=6hUY(<3D1|&G}%Km9Q+y(e7K}yXCem
zG2J=Qe20Yvhj+uR&MD{zrFUcVo*Wds4o7FqI~g$$H4?uIJmD0=Vs?}KpOd~LeN%cy
zI;M0pC+@kzN$u17^e{v?m0pnkk@U}{S3zFZWVS2mJ?Xx5DZN2RwQ>1BkiIVciS!le
zX%y)a8#ac1@Ibnt8Mr6^!cyjq9m-Z)WFB>bUMrDs;0w;Gor0e&Ny0L+!Nv|;=&o!i
zGmqIMw^7w7^lUhmsJDMy{z-)2mA)&zY`oKiYu%R)q^0z+^epXcJrO=FeM|a3N}nTi
zOSPu^R8mux^b<RJsR_&Xy!1`!+tQbC0Bii=f?{mq^t{6)gSK-3dH+!9ig<sDRa$ef
zVPu3u8?whkw+OvrjCExdb;p3aiw{V0Hnv?N-&h08-H5)XH@PSMp7dSmE7B2iZwUn^
zsQ$6ElwObyV*@0!`Z?*#(r2UxeCfPj{YaqLed$H%2h!toak}%@rSC|u;2aOljm@ZC
zJ(O-UZsBZuC{G37eqyO+G}WyD`O`$ChYn;UceP@X76eX(<yS5H_IxnSw8AVEgv`-_
z5TgXv+-7rA3LNtPmh^e)E-}@LX!4kQw9?bkbBxaDF8+%2d(z9&EvZFj(e@);|5AEh
z+T(EJ?#};6`eJ{K-GqR=!Z4gkzmR?&ix;1YPKlKjutxa3k<|c7gHx_oZcPL9ejf{}
z3@wdT`Jwy*XPKfZbAYHo@@I4+?+tPfm;Zg~52SxY`Zcp2{8-miT)r<oCmm4mxW=}=
zCjArXYtkNjvBLE>bZ6d^ju=UH8W-P|z9jvD^i}C)_W8L@xId?x=oRS|>4yXPXA~e)
zAIi)swJ||-YUk1JTU*Svs^;K)4QD$x&^Uu4;UtP4eit~z%(`?pn5q~(S6gjhhyA%L
zeM9=^($|>FoY^#(!kPzPkzOVk?Cj5%rEf`JlJ1yGnE{(-O#E%77o_h=r(#z;kiIQ_
zM|#Tg^$C+fgA$&ygy*C^PGB?$s3!bel3RE+D6{j1+riQ7IUB7t8fXjkvnA(^K<5to
zq%(a{>4@U{%2-_$PgL6ExS{(^_|4atBsS*a6#E~8jh1#WV%G~YmtHXK4T!w7mv>12
zjyF=xrO!%#B>fHpyk^Fnij1_Sjc~Ga1G^>2j0-ff(b(x57yNOrj+h~*2}5NLj6q%#
zK3pJ?E`6xXA%ta$YGlT0kKth-Z$E#M|L3HCBz>90A-AwvcB-DFr=+J4VVy4ijPz~k
z^U~DLZ<`{*bA}tv@!`{0yqWYW7tiqfwRNS(?rkw6lWqCuSnP8){aCtdplHJ@%m;Pc
z)-*KwD2l!TsL}C)m}lexR6~ff4^%U@JgnvZ=fHG3<Uf->EBzzsE4*Gb@`;3UMqn3^
zNhiV=q_0Vzmkt=*8D#q<fk|zI*RgM&SEO%AzsI?$CBA-Y4b&}J&JOu!{Q>w9_mspl
z8xCg3!H<VTszlvAAZwc2sX-z65idJ)7Esoov|6!$7d&Ki1QvF%#c<~){N@j&FQYa=
zSKVhQP-&y>_Z)>y`0+QS&v0mJ!{SdZTy7bMe>k}KbJBOEFHooqUq9+IWRnlS>%(C>
zKN!Wz6&Tpmh*>R(;N{Lbh|@lt#W7_)W2Rzh1%xc_ydv|ClINvSMK(r(>Ge?h{g0&2
zOJ9<n;){~4Ig_qveGL?$$3jW9$CsrqN-rVtu*aEYt3{td+@^~UrB|iTOApLr<g8VJ
zhPBcW=aTLC<c5V+n32|InZ&T4sdQvtef?l_8~!>%ZwuV@(*7()Ib2^{f|kW7%h<!Y
z4NzaxtT)?^fC2;lg7iGA-Qd7Vb7lpR`7G2V$^TUnhtj%2XUs4!v70%eRM&+#m!8M(
z2VhiDI*F=xNgTL?#GSt74yB*a(>Xb5Mv4BB756&%NBDS*{Db$|+7Wac(7^+@;kg~7
z+5+)M?Cx22LnM-K%Ri~`=cQ-xo?|n>!N$lqF}aIlWAQ#C-Im0jU*N+kWRVdqZ2Zni
z|34=kk*t>}l1HNvG45s%wAiO$00iq|#Pv&xIh%nGZEeWSC0Fj#6t?MePqIP!8qOgs
zOiZ#163D1O;AzL^-IrdKzDW4K!ILl9nwe44YR@2(2w#z2lEyaBH@B@hVMlHT(QjJ3
z=cJdVXN|B_83J)2i5;2YiMltu7GNMZbL`F-#Bgax>P4ZZ6Go66YHL@l>>P_ALydPf
z5(On<f`ct7m@n8mM?@gAK@)z6Vb1*48vG-(*%d27;Yg}7Z=_1W-V4%GkW@xizR7IS
zn!%Y083uW3Ou_=YRU3IAZM{2#CU)NdVcBwHGdJwJ3kJr|4GivyU~G}94<T&v?R(NK
zMpJhNG$+iv;K>V|esI3RA81<#<Ni}ym;wkQC5Hhd1~k<}0jKas4)M6r*Os1Z-_Ag&
z2J#QS_>l$ZVf0FNY*fQ`?BNXP{f=}8#C?yyF8G!!R-qa+GDZ`}6a!k?5L$13Y^|le
zGf1JTB3}+$8C__ylGu*tp99Z~ShKMq+)>XcZ3}d6Jn={F;+9ek<P+%=`>g}K-6LeA
z)?;9=H3?CaX(PU0GPQZw#}f!yZfIp;`F;%$cSQ6s4+qfD%lKHznKXMwz8WEELk7MY
zlyX~Cu(Z*0t&u3sfuI`f!j24PD%}C?3_`&MovnGP8Xe8K_q7O*2w;}@?O+b(gtQk>
z*or~)HzBnh0l3#ZMGOR90PgfI?sWo3=1?2rUXn{x;4-sq0~~uQ66V4hmLiT0(7I!j
zw`7R9nS~^xl^tag9hS434#dc~$t4k(@MjkgQ-hGb9@L@ZPe&Y27_8x7C**cA>w1s@
zHFj)PK*W_rCh@CCVs;qNoV$(eJj(DvK_VaT948Z8(+1^kFdBzU<JhyV)E<B;>6|Ir
z$FNw$u80w_6>;#68bD;&YhEF2^VIlTD<f$1@`%9gD{=&ZADt7;MnKwa;H*EQuE9mF
z!2;TTD4LsShM%xik+RifMgq~CaK1|80mKN*A?sVRHQ`%?K>^1#-iNV}PB?vZiXxmE
z5Sekol2dkv9KfIq1Zbb?D=GZ*@K=^7WuaRogD<DRAq~m-*g8wvJ}hv(axHo?B~}d5
zZj`iyQtxKbgs)$)`Ic4<9&w&S_}Q5UKoueh%xuj>?y((jabf3~G`@2#kz7j>*5ZJc
zkiH*qGVt-X{R{mUw7p9nvf#HBPQKg;;bj<l??)EeJ?l4@L{5FZ@<soPN{hi8<iBLr
zZ!Zb%96-aSX^8-%An7l#v2Mqn0Sqg*VV6hRRN&bbc4~SjgKz+OKZ!VGRbr7h#_8V?
z1QpbdG8@`;pJ;d=1AYRlvn#@euUgx)2#H_Wm|y25b#`NIxOL#Zj-jqGwe7a*gb1S<
zI<4o}aDkbn2<@N`J~r^08bGMTiWMy_i(C&NycQH9ma0J=8cH%X*86fN3nBxsnE^Ck
znW8qJfcpfB!*-_Nm_N32M{8_lOem}^?e8ug<GMGTqCNr(h-|E|*mXx2T9|<i>ont<
zf(d&SBD_F4T@sKRS5a_@rU*GRjHeo)I2pp<w_h~UDTkWPnMD)%|8hVdh*t<RIEHdR
zHAYkH=++u~Iv*-}RASUlZG(7fJlWX5!tT!DfO6v}w>ExcjZp7_g6;?)*TkqT+oonm
zr`paZ(i)Cb?n{^(QZR;4lJ?SVn8a&47Nz&_HN>Rns6*G3?^obg!GRxJZK0?CImWay
z2)!^UWP&ALn{DncE({)DSRN3A5muC*8js-BcRj5c9~pr))_ld=wPE=>F)XR^PAJC*
z2|GuPBem_-VH#HN%5-JK+Qb{LkxK6cGLK~~=Z)(lvcEJA;tmL<r3<Wq_<x8p9Z9z+
zXx7^`fGLquBDdY@AITVrgRp*G-+o|<3mk<gcSIvi<W#YTo9&0MVLO~l0)MKZzGfJ?
z8m$bFuZy$~@qRV`cMTD0hWA@@L`e_Am@r1Gv>2eJAbMm>(6JR6dY>qzjX6Vck8cbm
zEzql#$*wcYQ7LWlHB0)+0{>}w-_C%Jsi;lxI9D_rH5@Tp+HKa`2DCL`ueBOTg?0pY
zKhTZy?IRT~>6~0zm^ZZnZUH;y9)3^YhCQ$f-r5Kid#3G~hkrZeA1L$2{BCVCo+B0M
z8PK_%XkVEYm%Q)>>o*?Ae?pn_*gD1+L=R&#K#A|yBap`@8upTeXSkly>S>o|DS8gL
zFkt9Fno1{>(j%20bIjkF5A@8~D(4jblWqhelp&O{fs#9t7-!pMRC(5D`z=d(0}=0u
zpoX+LA<8Y0^Q0eP*RH=Af7cRsOw2-0P4i2()}?n)bQ!>gLG#bC*Uyl9^mB$!8)TFn
z3*|(*0*qXe3x@+JF#rpHhl0f6f;}8LK@?%mFj`makR4g%hGmR+F$~?<0>>HsQavI;
z4iz{YKt8~F4gBz)!)u82wh<OuMAc#|gU)bu(=^lY0Sl1aN_tE{w@+l7Le$`w53N_2
zfnW{=BP4{{^FE<K4O&uAGOjJMD|weW@3rI8nt1lv{W(rkWH}3bU=t23=b7hyFPT#{
zont+h#%k+T05?|5II%<TGNVDRq{kemwBr>TVjzbXDmML<q4m8wX#Dg7_1a^#YNF{V
z&aK!Z-71un3HLCAnX!M989X)otG!9`VU|L23z$PuifwE=_O{@Z(0YlL&)K|dZX3u+
zZm7exb04$j8$3pZwaC%2w82icPsDTiZ*1o@gscT`RQB`G?){^{ngbYL8tgJMO8IKA
z{;5`w=m$&VLsy3L9Z`o~IEUKrzrnIbrl5Yl>6Msv;J{+R(`Qb&)Bp&KXe(OqUoZVk
zZsDPHK_zWO7*m-c4fk5@Z~%eTJ?2|QB7avBsreXpI}AbsNQe}w=I&7k$!TZm!N|!2
zpGcW>Puc?voH37XiP4`Jv2pmRrAz4}X+$X|*z_Y@{D{#+C)*T)WIzv$KHq}p<yHe(
z8@adxP!FWL@E2P2b7U$#!;M~&B@I6TeFu-dwL#2b0CP-Qhvc=K6ZXC|p%p)~qkMzm
zxF*-RO-z6NlII*rIJUEIEADxP_YQ^GL4(WeEmAHX71-7w;T3NDY|zP>S!h#Wk{Uc|
zZc*zH|Eq0hY&iQh0_T0W?;Gj~^-dml)F6C+duEzi821PzdmULh;ljcY^TOsEL`}Zt
zk-HeCvh!*i9Cf^8iQUZLCJw7tA%z7Uf`T+T!)^xtwLrkVJR(qmf)djd2;z#O!wA*b
z8?g8`IIY5RoKVj-tC-BmC(_5;a|~pSU@hi4<P_+t(4U3XoURwIqQ)G?v}At$EJ%-}
zD?R)xD&R9Vd{k-nS+gm%f#^pb8oyLP3BR{A>T9ImQPUpQ5Ob`Z7y(xF8%J(q`8J5W
zyS}EGyk>i>E#g{R=o>_XE_5$r15Ux`GIN=Y>Bfp!W!%s1uW!`Q?K8Eqe{^2Vk$7!M
z;q{$!Y?!R%7)ZJ>U@QBZ9p1-7bI;tCxB;O6LmFMU??$>s)N|P{mV`+Te=x)V3*%p`
z45(YuLmhJI%7^|4!a3#AN0Q*hOz;Y=rRF=R&<(K19*nsqIrq$p1j5L7=|iC3(LysL
zrdruVrC`W86SnsWv@U|!kX&Xgs@hPX-t(`xtU}Ksf~$GhXjJJoKtqSy<38eATC5V`
z$??uUhIr1JMx+rB6>wf*Mf~}IMH57lecY(?zj7?$CGs!)wkwV}To6ZZh@{43aC6I=
zMmtFmVTR5OfX?{};R7$JhIZb!MaY>QA@020BS4wb=+#{jyCN=*1T(GtoS3@C85aE`
z2Yc(E14JHSF{6WIVkMaciQ;rn2Q5K<7lc6>gq<L@O<?hi{LZDZ8bb><uwOH?stZD;
z9T{Y6r1hX0O&|^k0gnL4J1$XN(_r+I*{d<s(LPiUCOL)*rQLoHx^6>9=mExWX)>Dl
z`}ZKAl=;tqM@npHfjz5?OVefHts%2Pb(xWcEwPQkQ0&35QDB;Kkb7Z_9FZz5q*G9o
zwjY^wIxr`fOIJ<}OpWks)rZHr2tvee3Cv9tf&;*lr7;~eIN3DPyLhoZqt?_$gmE_Z
zjw0%u4H#5~l8XzY)YaQeyfO8{2@WjkcngF-;<u}T4(E`?A0e2DS%s2;q+twu40vC~
zH9%esoUt<IOGID+CzX*zjR1Ye^kO;rA*k#%vGQu5KbM?6J+T_d1~U|vY{sa$9R{=n
zZ90_1lUL@|k~$nqh2bi_t$$!mC_&x{cX=Uwj58g%3O5$J<yfp)pDr5jwcGl)*3Ug6
z1F4PO5)jxW2X#MoB2_>-xy54@yKrJ#IP6Ief@rBMxmA)eese;~Jrhvx6J?QtyVb-~
zM|9|3(GIvb@ZrIHIly8!{dcLYiz`s^o&mk2y>nvJ@imO=32S`ebC3u2XT}z+i2+mY
zd<d0dY*Oge!E@_btBm-N8H8LJf3X}eIrpTy(mkAhdY#gSiZ9`>EiG3w2jCc1^p6y5
zy|jS049Jlsd%Zo-Am}mXs<f$0mpt30*HJ3!MRN<kYecj`jLA1VzKI1?5bW$>v|AHz
zSlkT)j3}?GV@(O7S~^Z=IR2IsTh*T}*YSjwq%2n;4>g6eK<^_V8b;7T<f#z#ls#Ai
zFRxLr(Vz<^eWpyREU*C)^2f4`iSmaVctn^bBIwMF&c+~&VfoMvP^7&%nA?JyTw^5o
z(&}5C^-V>@xefj}r<7LMV_#v|yO(dQ*g6G^2}W7GV1&=RE2vS1IL7pDEMX4sKkWy>
z*T2sCaZAe^Cab!zNGK}qW50Q6VeXoEdqmK%Va-=J#s0zp$o6jo=Ci`F)*wO`gKr6V
zWr^@|1C3j5P*cX*c=S;iTwk!9Q?j0=X;du1=q3kH^DRjnPhnW26r2q#Pv~C@6=H-H
zS`rCO4B+xF>NNL~xMfWGwj^1K=A4ECn07kgmSvwAt>m0)c)FlA{p*m%jLY`OPHJ;w
zg`LQln8<hSl=W-LO#x*ZHp$BV3s(4r^bs&szXp(*cOJ2IGX5eod}ZBTy*Mzl{)xG@
zFOO{jm)trym%Y^wvUB1@mo}F*sB_J>``+HeQ9F?dwzS>9*^XcXC0~$v=l0lvVOQvO
z=?b&~daa3(F5u8bUr=saWwL#Lp`00wqTa9+J}|hcJGu$Rlz$?WjE5W)ew#u1kV8ay
zmN!qrn-M{r5nM;0+V1oLEtb3{>ni!Wg@H1ozG3OqOTfhkb{6;C7&#fw+sgohNOQ+V
zZa8gbfGF<h!)VR8IrX*f_wMfz$*kN4wme?NWq}jsq~oWgf-^=mW)Xcv9e#|GRsBVZ
zj<%&0WQU*9W{jj?j|gDa?AAg$#z^ewrU~$4!32ZI>1#H>Fg?9TsGvYyNF!#k2YMe-
z!O94NL;VjdW@1#q=*GwZbyGF=lu1Kf5?jFfsIB@Iw+*A4)NHbiU&~f8ZdF>!o73qv
z9~iF?!OAT5#HKh$4K()8*?<QVB(1TLc5C)YRKhYN@HQ^F+~TtUi7s&|Vo@)smhEBz
zC5v}R3&9#BcV;UQT!3ou0Asr>fm;jEqTr!Un5;P;Fu@A*tFa$i0+-AtT5O(hAW7Z#
zcTvPU;1OzjkqhS{w3Lr7iR&{0uh@MB^`Qy<yxk}Rw}!E%S8SBf(usA{4fpT>!kSr_
z6ARJ6amsM<m0PcdVeT6stT8?`aD~Xp%0c)&12zccWK8YQDUDI++Gy(&I$7F+rRBVU
zVV4=hx7Ujkwa+E)dTfs!y)-Q-#xMmBz^Jt9MSw?Z(t;pm>?mRvy!?uwWri^b-#jI)
z+6=xTFo(iI*+`a?{#;-iE(aXau|Wd0wIamDV>D`GS<VjYIk5$+@U@3lgbi$YI+&&r
zG~#`dkj8TN;DW6UBh3BfYv@fI9OcY<dTMf)F)-(jBurKbn;Y!6ux^4e6H_auL$z)~
z*3zkzNr$Lv==nbL6bppai(V(-G&Q{GM6=lR8$5IV?8YQA3UwvMb3z7KpcfZJ!d(m*
z5Z=muG6wUGEm#o@UJblQjj77P(gU_FS^N<N{T_$AgMb<f?Fta13PlMz&l-}~sh^1&
zb>_KoPh*zI1v$zsrmGAA_%-~<oCWl9d{p3~2)ZK$3?BX3yhLM>Sy;anC{z^=AYkz^
zPBe%>VIIcDrRa5WILCd#YX>7_iZVF`WCstuCUzNNOd3nYf)p?YW?Fa(nGvbXtzr@=
zX?#V4-5I<5GgHnD4XAgCgClmVte0lTC$k`A(8@8z$Tc{|<6t2PFLwnSeazR*Y-&cd
zUzE7Cjz>ST)`Z#|K*SoY%@d3A$}os7;o7DeY51Vr%Hn~??h!$EFF(RWP7Eh&iMk_d
zUNTF38H+bE*A){i^1e&0!3?JT^YVHQAdsJceGX8VK;p+NO*>HGlBYc&+0TiDD$Dm*
zq{w4mKY`gSq=(R$3kwEXqoQwiCNa)*5Bk6*;Mo;lnwh~LmZ8E8$09XGVGB^M+3CGA
z)?#d2(S^-+Y%QnVoxT)=*%P)?P9zsCWCad*h(*SFRHSUta2Awckz7VL(Mhhcp3mEo
z*i@%fv**$sxRBlRm;L}1FOdA@1#IfMK|3w$v~~m9_9M-RG|tI3D)f990Y~}p3S=^(
z;V5fGS}?b6$sI^&ZZ)zEdcR^g#Eh^=(%xX8WYO=K+kn1~h+qT+@`?1pU>0*>RksU}
zuYg3OG8={Bd^?GW8q5OA5Kzm6=PY>&K@0EpP>G6G)cFrAa8Hi0VN0D498i$nG9sW%
zx&>_3P&jG`MGgiPEE(uo;<_sff(u&ogVfU_g0x&yZsCuyQJrS?v~=4sbQxlr5uh*a
zG>otn)piBVH}K37k8m{ZMuW&SEXB6gA?5`DF9ko4gi>q<Tc9DdtNWmDz(E<hcxl$I
z_bDySSrc+b*0i<(OBH3ou8!V9z1~pk+F0(i!LXh)p}roxc`NNp52OQnS5~}8c!J=O
zT<koNO$0C_lgI*>)L4pNZZk0>gN*EA%vepYN=$8@K<lp%2v?NrB9aV$9`EcW#i9JK
zsJC7b_l5&p5ttT(GKB15f<iPUo(TJwVRm=)l18AKNXL;x*O;o@oPIs99KElJ4v}ar
zx`JLA(0^tXt1gIN!o({8BWKbhX~9VF45;vIpa2yh!Nw4?ut$}JzauCBqbE3k*3ySD
zZYXN`_0B5%80OA`?@ndvLtDc{EB;9Oz{s_o0STt@7{Ak6;!^RYS2tm}BFtM;R^E`2
z=f<X<V^6vuJ_48829;$tTW`s}5$ignkwv#pZ3#h!$9Ra*jNcAF84e)F0k*t?*CcDJ
z7ImfkNM8!$AV!#0u&V=)KK5BG;VYv9p+Jj*(m1p7U%qX<T@jbW08u(<d7PDzf@Ctf
zUb%=kBHRgVcEuE{2?L(Gs!Yp_(K9UD)W%vg7IJ%`w%(S%BTs>RY)sl0s9+A3*ePOd
z@oM1Fk;F7WW2ZJqy|Wn&_uPj?6)8hFs;I_`l^=c@HcX_c5pwTIqP_cy*tAE)i}ZhF
z6BN>49I_W9rzz}-u2*sbG*FW7<yeT;T04VJydwh$#gYXYGpzCz_b&|U&s~ed=1$n;
z7j$imAak#zM>fZ%b@^vDAF@SfX7HEJ*uTM!>a&3SF`R4~RNl~MXGC-t*W}!f4Y!Z3
z@nhrhwpPmQDmcC&$Jm_=6%Bx;xygD40C$Co6b7fv@F)9r+s+I|f>D}sta->t4RavX
z!@)RT_nWbq3h59>vo!Q^vJ(T%`u{h;r`(pK>5oLq2is6+`HHtK`RlROp{LuE*mBAW
zDO0*@fn#IH!;JWPHw(qNM7iNCk(4j-2Oeu*zwn6ECj543WthlMs(yZ;SiFMZB8dNE
zN(z@i0JYh(W~UZK$#-R~t1Cl6SHw~EzzhW4F<fL}X~_y;x#3-E=h3fhX8GFoLa1uI
z;F<IY>$d{}6|>?qw#tc#V?uZ^-<g3$p=kxexF&I~jT|v0gY3Nb$ojlSeErDGWs41u
zRB8u<paF<IwU(0!M_lhwA)6b&qvys~(nBeOtGfq~ydt>?vf+@%oLjHBu($;pxv=qc
zGW>DJC$@K5xfv*`tV-Qr;fF-PB$7Gt!J063Wr=5~Y38pG-e*SDxgk~=aY91MOmA_*
zZ@2k77EnR>yyh|Xfj+PL+q2-`T@kXic&)bK%Td8wvyJ8h>zQFz?~>RTebsLb60Pm$
zI}A`mjq<KdFVCbqn85%7c4UD8l-B?;5%@G<F=OeTog8<4<(jP4NyO9sLAzc7I=qPm
z5o$H329vX6Y96>Wue4(i2?HFIxD6{3jRr@9IT(_b(nC_CBRWna8ynl}obALPWJW6>
zyb=En269BrlR+Mrq1j{WQz<Z)D-P$lBAV_6@y_?&ST(pH<Q2hHgOT4+#%K-P_c-c;
z3~~gY^%AhItHBf&en(AjL->CxJ?hsDhKrZNzc~|uM}TTeXfqR1?DL%mB8l+%NV<)c
zx-bgc6d+|}oWbiI5l>q~bBJVdjS0B4x!!XRUxUK5lHQk=($gFX8PR<3C{|YD9)2qN
z!%qkakMK<aM5m2>CVe<?z^w&0Vui=nP;h~T90TyIyap2TiyBJaZ$$`3U$HX-5OMth
za*k|97Fcc!h-(L2yq2YoF3hCYCqxe$sD%%tyM7={1kjQAXE=Z;_ceAP(+xF%91`?x
z1XW-gW@vKcvhq@F<VgzyGb^Cq0K+2%cS7&=Cs5+tBETii{tAraM(ZyOc%*GnGKsvd
zupZLNS2D^FHJhOzBRsRd*?6Og+2anqzcQwMj>TyD&m5(k-JgtrFbYJMhN!HqR@H+c
z1AwkAkC_>tYy~QuS^Gl|f{RfEDY`o*WR}4(Bg2_@u$Tl5@0`6hH3oR_umhc&4#I#a
zV3bhgrWSWZm2FJ~+<gjL?M0tBUK*TJ)9A9Ji6m@b#ygKJWIJJwWwaJ9Jr0kH3tRRp
zg1wccUy=D&j)D}(Hg<n7RxhQGIYVN!y>kBiHJd(aMa%vO=b$1)mesL<N}fEq!CTS{
z>S}F(Z)^6j7s0L=y?5U4KF_SJA>g#x&HzY6`nL@*3}0V?j+HwLFe%o&CEa23uF0lm
z^j61TuKhd-S>Nx*%CyU^Z5ypT)lU71#6JT|92qBKL`{0sCnFDQkh1h&c-$n8$C9KY
z={~dNdd6!>aCSr)ry=E*^#l#82R#(0CJ2iCyutkhBpj_pT_P1QRG^fOfmMu92@?Kt
zaH4ZibvE`q8Lt%CO~!kk+i=F-0tg~U^dw|tFf}Kd#q$r@;H|-ZiMerRg!ph8g%L5X
zpEC+9OI%kZi4_*_oG$+!`t3JgaaQBP@{m(z(~W`jWX7<&cww~Gkm9MGOfp3fLH6k|
z=&)YV1hVdj?-5N!Wp_xUj17>1xjfaUw!}6H8t2deY^Wh+gp6aGjJPHOy^zkf=ee~+
zAK`I>KO_v}5n^6!qY&Xk*S2YLgM=s8ZJ}LD&w$r<fWicGRiX1^OD?i)B1y!HG9?p{
zRCL!(KorZJ5p<N{&iM5KFDx7OeQNZLF8mx>zeR3cHzyEtkL_T{k#ByFMvgg?sR52F
zE$|+J!p?3gBgs&d!Www@*~n|uDRU9F7Fr(|pLc`nno5U|G0Q$Z{n}SQV*PT<;1)Li
zG_sisW47%0Y;4|e`1;27Y`Xt!`%t(sI$N8bCcLDQK9n}l$(lhoNMMWScHBm|$dTdt
zOAAJ>Kb;uPrMINFqyxI)H*D+$jyq?coG>@@Is&>PRf*7|Kqb$a4A7X}=^~)C03S;a
zIgzL|oMXyn-m_kT)y@=HBr#JnV;K#U3tVv+(2j-`FlG?QBAsjv(u<^a#5-PZfavv&
zpsoh+SlRItErwx75iISP_oYYD1JjI+6~s#-i}*+7i}$AV2Ky@oo*zg{C}BY@i8?`7
zL<$b90FYbALph2Sdn0;dd!LqhR99rnBNi;!hq=iwANar;sYDnOUEwl+8Yp1TfUjw+
znHyHVA$?dvsu+QbTx{!4av&c`C(=`BXbBK8H_0`|XY2x)OX)-Dgl(CT{+!vGW%%aa
zAef~8?@8}SPt$u*lhaJ32b{MxvYwV5*0X@baBl8qiTu~d{EQ#=l91e%I?%qk#VTWv
zuYI(jTfbE0dmuf}n1?V?!FdS+X148p!-Tviy&)ZP@N~43MmBt58ALI-_yg%<PX9;|
z$INCw%u)LVaMC#Be=fZb$Q49)*<k>s;T`Qx2+@JC6RLy}14ivzMrG(K2`$UEB1at&
zBDF>`PY_r(7-t<{(eU*dzuOQJRt9Rfg8;5`0H@Lqq`OijeT92%`bWeF!F(uv072+l
zhQE;hqcoGgXakTgDGP6mcpq~U+PL%2q#s+Gvv`NOkzfm(vAlZ^IJ~rEY)EGFhn6;|
zvKIl3QQexGjG;7EcD{#fP(dfdNPHUe5(_r)j1457VzHe?CL8jG^ke4H9nsfS5>xs>
zj7QSr$b%Z`Li(xn=hD9PStFTDu?m&-rCr$3C!OnmTYAyJ#R4p51r2m!BSLmKfHlVD
zfGWooJ3d0hmaWnYVj_h<l8!LFbA-}bLqR=o-NGzChQB5AvIkPe0TLT_Rk6*46B+go
zrRSxI^a46Rw$Vs6JE9!~@vU?ry(axwdH_^Z5-gTPA7f+dcKA!ew0|f)B^^sIv($6u
z)=jMaVQ#GA;b&w=Wd+oj*#TLR>zy$&yMw`NYyQeCM~wD|ibEbsD(k&EbYWAlbC$iu
z=`{SqiLDyQ|6F=a`aWe!87jYq0(6FGhXK%q^b6_VOOo#6XrIRf%?vqRTl$}c^?g_R
zg>+AP+ED1A(VoNYjM`AU(}%Sdt4&PjqY@DP4GV8%3luq7%M#-`vQ**#MZ6mH5X*QF
zN5=lF>_jyOr8m1BK#Jhrkp3-EPr=tj)#3_Z>|FX#dgL##l-`p5Qrd?>9s{Hzn<&us
zH>LN&d!z;Ux%983M*2p-W7V^XaAlq7Tn2?%iN+3wT(RR9fVs16ASNtc5CvncH&)83
z**fQgF0xOg52dH<{GpYN29A(t^MqcKmGqYM7j(<b(AcqsTuT=A9qD72!b*Bi`WxwK
zj<p>#mNMw6wGlhdeDSWN*QI|i-DLu4VMkcihS=AG9_qmcD9l8y8QV040@7eWQ^fUt
zs6d8_AA{2b9JgQ~X)NR(X?{Cc{G7kI1;{XA@hXEIXO`>r9{x)Dh4inaR{E!m2KK!j
z`#C{#JrORXH>Cef8cAQ4UXk|A%R6yB<;sr+ZRqbvf5F)GTh{#M<-;|{e4X9|L1Z|Z
zG5ASidM%834np2T0n^@&h^ZoWofwRqOz){TW4EOFk?Gd34gv)V3SViaVJQEl^tSZZ
z07*0B9R~}kw2bw!YvEdYNBVPVDt*IFbE?Sq_DCoa-Qj)8Yu6XQCH-ruk^WeEK$J5^
zVUz1QmiDX#bNKub)08Y;z=o9-!(`T>^PxG`>JYm5DPeR*0U=D&q+ciKoLfLu(G60V
z(RXTE=fHY3fHUbeX4_v{B3>ChK9W9?ej&Z<Q@zv0uS<8N`vyHmJIIRE>TT&q((AM)
zNm3;}l>VB9{t~A+O)S+5RO~0xn|9nsZw+czrl$6rUT>cisIZpS9)?yy(+#2LR>;o@
zoYvr4SH60IZH>v4Vsm+_9Kdq1y#dlQpGrR>_Fvltb-w}k1L<$1UvhNm^&?zLuS<W9
ze|w6AW`o5L2K=YeTYe@<Ej^OHN7uoN(lK<DHHU5Q(IfaH=_k_TZTSbzb483iHZt8B
z1zsEE*EPdVdF_IQ+~Dq_f}iL-d#q7s!$(AoQ3NfGy?el4pF=4;+&K=qmL5w#m5!vR
zr4~M3bV220*B?vYmww?YTuUd?-vIm6(tW656XKQ>Nu5hSlU`%o?DgU$AN*%hCVduL
z6L3z2-+V{<E9nP2<=^yacd=)qXqBwr5imhiX}c}3wq!A~j!mE8T2oT%@QaPHqJlP8
z@r}6&Z*C=#8|9x&|3{2xJRqAo#U6hw{Zx9>ZFpo5ccfd=Q&I*a@Kh3Op8WJ*OF!XQ
z=T85>&pgE-UId&~F)Q~y>BrJre&?{k!F4{Da_Zoo`)+L8_qaq6>rQM+;5jz4V0VR5
z4|yCB8DAJ3Xky0@oRjrknXA^&0fY_usZ>a>aX@utl=<X}K9asKy*G%%YqIE8dPBM=
ziI?7E8=Oh+OFxs|9-IlWkv@^WFD<3l2!ac9b`?19JJQc0c695XY}F^y1BwPYQAbQ2
zI7Gc34Z6u2)U`GaR%R}HY!$#!zcEl_>9+I?{Dl)rnREPbfO+BQck_rh(nr!?N(<>Z
zBHf&*YsHlxNk5m~8pPqD#%t*f&Nm&=$dw*GQM})hUK@xo-TBX@N7D1)&W%lQo$zXJ
zNI!DZ=bAJ^g%%9$k}=CmqVzFSwYqk$AYxQj49I{0C&ZC$KP2dOUkQGgR_>Hwe+-y9
zMweHEMwo5=|Dlvg-^U-WY#L|Un;%L)mwvIW!XHaNfRKI*ryX>RDQ@Tu>8Atvr_J*N
zX@MD=Sy$|WsP`l3m(m;C`ky{Q$gd2y9%I1Q#AC-0gt{=q<K2u*bqxMcvIP<%c}A;g
zXTY59y=1jUeRk5s3Q)Vdc~T_!ed$X2iF8{!1W;H4q@`ICuSvfcNTD+Qe<0l^l}b_4
z3`F;B>4(zm+l%*5`kvHC&+zbL($^^g;K$PI($9e=d!N?o6AJ^ZSLQ=2zIup41_Du1
zkl%+B(UL*t*r?Jg$OG(SjEfFBSmXyQrX8-`cnurD*3!q)Ur9f=s&7!A3SRH=;869r
z_yg%u`YE|&R3<_PuW(4iI%x6%OX+n>5=(>8mb}~9O+4rYUpz-wDjecSx^3;+YX?AE
zQidfn7UpJ&auzl?bi+mxA?|`@tPQ#jJTx5d?g&Wik9VX~JM=NPe$ayV+Q9~IDb8F<
zKPTkL;gUo`Qn3U(E`CpXEd2z`rM1qx9GK+trg`XNEB^{x9eu}V78dPE%K?)u!T9IW
zZG`70X(8>~c}Z7}f<*GWFWqCcD^z90LbN<)`-DcG7B>5Z^gUz$HY9E%XimH9aBJxk
z(%i9?3!=!80py$wn5?n$QXV(6N?~KY`Ad4{cj$lAz?O7-?X%_=#^$IDW9?{&IkOs%
z_=<CeLfy9hfE2;cy@;RLjF+HM2j@pd=}ezyV1RB?;a{@nViZGznYbXhueUFLPde|9
zj7f41@FRgo7dPquIaRitR|_EY3Y>m7i6BahhjwU11{4x-da6L55Bj+ygJ=x7E0BDP
zQkQt5-Xj#?baO}q(M!8Qh4;)K^;l#(`q7r)UV$6RoK<EIIs7ybjUMj=Ir>%T9UgQI
z+?Lzj;<r&{X?BJW8M`#fggF(^OPigrvQr5<AFwf+>;)^99^`hPcCgsC9<?3QnPaGG
zGawaq!&?$}e?o*O{V+9PXgxvxE%C;wby~QzLVmk1UeiyT@A|Fm;-UOA{wRrHj~Fqk
zX=`?3ZOrD5h}2@=g7G-B_g`bTGX%8sAhIx~c1|^{W*g>I-Btv&(;Kw2)Bkl}|66;<
zFh=!FDtv<GxY+$~nw<u)WpiKlkpush_NJAW?U2~8q5r4Ghi!>-x-vpI;44d0)BO*m
zJ@BX2WZWx)=KOSJqDo=F3KbaJxYzYfTUc9GV6X*bBG@g$l^L(hV$e|2C{jG>;>=dt
zcTw|X`xDRBV7)~K8*Q^2`U0=~ivr<^M*G4VMr#Xf()FhfLC3PM;D3#9`jJ*Uo2Eyh
zDtAhv+8Lnk(&rt&i~RjJytw%dt`x!61Vsx(BA529AMOIuZzuf(s~r>CEDSv#VQcnr
zu9dMs6stG2gXRiy#>je-IIfUu`2+*>tBCMPvWSbTU-!ySmH(&Tc4PGS2>+8kzfa-u
z435^02y_N#C3rp=iC2c?E5iNpP64IM0>F-46_v4|p=M8hmnTT!SCW7BD_$`8qn4EM
zH(b26MSTn2e~xHoB!q?Cuj4U%Al=(KLM_2wV{5n`5Y4Z5Qq*^5RsdX*kk7eIFYfxy
z<o~x);ZI+@-|)^ECAJmzyh3s%d2k}qMaMxlSn1X}RL3-=R8%ipJDF-WAi`A~inlh(
zPHhA5YEa1eK>y2|-se}+*xy9Te=k?YzDWS^6b@iP7njU7L_61Ynwmqpn^AtvO-iZ0
zbq9NXiB;;F88g^-6&0Wj^FzXnWj1YpybU4!-7mp^TKUIiV}n~4cEHr0HRwb(FhtQt
z;Kq?q#G)>Mr?0FM&{Z(Q`$VR&u^rdY&$_Vm^V8#rzt!hI`QkZ0w?S3C29{FXDP@5D
z!8DE17}JGyyCny(*FV`dbKzGcSmyv^4OQm}#k__x!ro0Z*8jcb_qV+`AUK~KmDyg1
zR^c)A?M@1{1&6tj+DcHU^iU~oJs3=r$t{N3<L0?_Y_CDrZ*3c&^4EIF|C~RWb!f?6
zF}6f{?>*@$sTpuV0%a@6LZ{aF9|H;27@*A4hk|(Xk}D@{rp%5BD!F0vYrNGzoKMTX
zKQ`)FPG6d=A2GVSvhA-}8-XsO=>;=6M#fa>U2|m<jwh^QMEwPOvxL1_{faO359O0#
zR_0W1atfyrtE_Dz@z{g-47&C)OC{`ngZ@sf`E4_p=p0>TwK?sP4SWiZKi<g!{Vn)p
zlxB~N{MG^v)CRyeBt*mNYqAQHt-7HwI7zxw2x>j!m090<WD1c`*OpdipHc(Z`EG+p
z{gveLgwLmc#jg#1WT^EKETHJciokQjPS_h9$j|`Jw?GX!$VE%YKEsSer%W>_f)xn2
z#nxZo$15Wujc%#{|LTJHTP?w_8sGeScMc3cC_FjQ{DgEi1{F-U<B%wVMJI3tJ#r1O
z85HNBVs{v5_|%ORfJTtVf_W8+@T>aZzhn9TeQuuIMlk1A2}``76^UwzdFu2(>Pu4^
zSfhf!F&5sHBp6M-iAp&;<cLo68Jj1bXk|mn^kfL`Q^oQ3U_XAv>z}0mtr_Z)R*cF}
z%qU5Xt+lngVQ{GmV4pRZb<KR92@pjOsY*{3tbr)@@3UYTtvF$$y3M!S0Yn9L=SxR5
zL^oYrc#Lc!ZLOMe<HZ3EM?ooq_$S>t#}Xe%Ev5bh>eMiY4)6^XF?q*Axw3MBXoQO4
zAJIs$$5f`F#*{418f92mGi_@={pty`L|h6E5yn3Pzj%@eV}EUX?@wR6owp4|xDh`I
z)jg$$w6fW!D@Gnq==NQC9V_yvDPO;2QCh|VL^Qe?w4r9Ugz?O4d*C3@*2YYK(#NH+
za|u3OgrUN&#jS0D-Roa$JzG3Q`{X-sV5mf=LyKIK3QziH(f1l5)BhU*F4y+{E#T(`
zKX2`@s_SPS^UiZi7HW8lkEKswb(T*U2IucQSc3Lfmw&cPicvxS^v#p?FCJ`x8;b-D
zp<j%Pd`v<A7WB+RlS?<2GwU<nqf8z1w<b1?q=C%2i2yWweJsfViSLr+EHm<9uZ}%*
z0GSQIis77XpHAZN?(kDuhSM6Jn68XYksB|bSgAU*&T64d%bUjG=GfPonn*+Jx3-}q
z=`&o$(2Qsv8-qBn_`iX$1&g5b9%s}wr&a+KvW(#bEe94n$pMr=*AtM~*vXN|uWte}
zV(~`y!chYW2ed<kvGQZ1!Nn33z+uasix)avAUtuO=!6c?b7rV;Z<l3u7S`G-#SJ=K
zU~PsdLUc*b@ImKb-7}jVkXycd<EO;?N54kgIpa>3PmsdcCcZ~CzJ!FZG|JeHi)Soe
zJfWAe7dFtg_i2ory@#!+w#J8+c+z~Z1#%nSRU+`*;DnJ)GwE5_tUqRS%nw6bt{8GS
zfnGd*viu9=H?m+d%IXdFXK|DMk8NCI)R4maZ>$wz_vV?Jq0DSLcE#)5xOg$=$0<M}
z71*HKkKy+h?n6u7EoT@*w3dnAHkM-d?AL}NR~dztG3oCGmgi79fYG(S(E*gwZRrK+
zIq9CAk6D|NoJj9TZ%U5`nwTzrg{f9GqXZ{B!;-$F*QJNs7auZA{K)37OpQWtD!nVc
zE<M`02bPSaD{VT-7(QERqml}bBV&KBrPF>j?J;va_dt$%gLGo`a^nHltaS{qYcb{{
zEA#LEq@{mWdRDr_Sc#~w=F%l|Cw9yKSbAA{fnaNl(g>|Qk=~VFlO7G!xRmZnFH84q
zaN!6aKBCj?E$K}<t$Lq`bPL9cW^@ilUrkFuGabZ%5xxZ1F}&=Cbao%I*5US0?Z7X_
z){S2=WPipNZP1h(BfwI+B|RhEm+n|XG6BU~NFPgYNpI|w|1Ifh>7MkIbVz@RY_jPi
z8u5nupG$|*^V0K7MGSFdz+s<A?@8}&_tCVN#nKMmP5|-<f?61j+_H&o^Q@`01ROC(
zkPH57YDjXA!je<{85<ulhjCg_4Dcj!_a`aObJDBuIHHbO@y=J$yV5VWtw5Se_q_Bp
zSz~LewW5RLW9iN9i<ADpB0YtArS$WzbWA{hB7G>mA)VaBByEiJ-LMN+sPhIfuN_5e
zZMIRxnvbw05wUCt>*ex`lwr+uo<nQ$%Zz^%l+fNVyK*3d+tTk!-;zEj-Ny`Ha!~M|
zR7vkie`)OZHkR&C`l9qF(yKUtD|qy4M0zQ`DgA%=cX#oT^m*wIq%TQFrk{s=SL(3*
z5d?>;?FESFV1&__LOR^o6pa`Z+4`HORQVI;C~i>kF(btzI35j3S%r<Rueh7no=24E
z3h88k!40(kNcy7mhte0N2d2Xf!NsNYzVs96Ur9f7DTMt0vGg_R$k4|%&vGuENUur%
zO8Ohf-QCK6mBouZe{R`8nlJiS(!Y^@zOxy%1>4!TLpDcf+{EUUj4ZOc&OIi^*u1o8
zvNF>h)P}ixkA`5a0RkD(jS-+{`mPltMt0XBLkqtleHQsAS&V^H&!rEg?@Rwm`td;j
zkEAb4e<-~uJ)jdc8p%=+`ib;C>HoW}|A*4&q;E-ImY%g)P_fTa==B5Xf0KSPXhw|;
zY(@u$h~8oqPo)d%(dq$Vjm>>b#v#g#Q~JVEd-q+d;aq<ryPDJIw!r{z$b(|@Jqdpc
z`F~sbeeCWYhs~e!I&0~B(*HcL0x947vh*G43yi$TiRU8vJe7VX{cBS3p8UTg{TI^b
zP^6EfT6$G_TDp|prj7pJfWq|jKWB$VGj4;^Yizt;X2XrU>!V?E!xowmRj$ZMS~#>3
zgI-6kjsnTjgq-u8)<AOwFT1M`Ahj@kN&2VKH#yX&qKe%TP1n-1QYO942|(A6F_&JE
zzAOEy^emx3Z8Lfc2xIr8UrMis4AY&zApJAxyVA4x(MYFSXn==OBmGo*V{jxu20lL{
z^PS_pqU|N`AJN(M5=H)&L)VPaOScHzO6v;U>rb1%-Z~j*Lv-V3Y_w?U)6V=Y<i96<
zM*3svyV8pwh83%y6AC>m?K8mhS_&!O_^kA&(m#`4!X8F5w*X@wNJrAU(i_ptolD2k
zSEN6c{z!VkoJJJB6BF~UluI8-AMQMv)|Vs0F{*zQ#n^^!^ditejDtpdU}CRvS8F?8
zH}mV4OsU%^Yn)&P(<esYjCMYADgb>>`i}H_(#vT69=X>PB>Y)v&*b06;(b>76Y1O1
zZ477t5jjeIU&;mp9YX*AiS!-m8D>95ZDY-oucdp^iS*uJCS+z1e*~ez;>WYpNcX}I
zw8>}#j^R^NADC=~^Q8qw*Pkd5UjmgbfSbqInk(*+8H_%R9+C<9tn|;NKbAfNfxBcp
z+68HSV%q)E^*=ZL|CaPDduC(-z>2Bo`_h{}jV9$@$?tqedIjF(T}jd_(p_e=<<fJ~
zN7B37`XBIMWzFoV>tbrBDHWVg(<8%XgteEJDQyfn3f}XuiK4|e>{Dx)QhiIRzAr7=
z4o>pjDgsZW7p3n?-@^JgEN?@P)CNYzvkd=!l%8iIJu7`j`XlLOGhOOW2FL--lMVY=
zYEq2xqVz}7A4@OcGLOmJmr(bThTM{F`4pM>L<BUpQ6(#8fJIB)MmmvB=#=PmI1vaB
zff^!(o{+YU!0>Ames38HfbNZL1DOGDNw#;)LK#cXNq;E)f%H6sm}EXM!wlro)6%oj
zf%Hj||5$of`XlKdOP?d*6A`4ievih^+b+Vi@_#ISM|u(T@j^Nw_H97LZ%NNdPaCPJ
zXFx{;=wozU@PBIbF={|P7DI(89b4~4M2#!j|5I3XHUO_OOS7VIIp+->0t=*Mqpi;b
zE&Qf*mqSRZe(CE{I+6CJ+tO3gUg-Z=dPe$v=?|r63EZwYBYk8wlsV&Cr+5WTr01nS
zlKw<`&H(t5KxJ&bR8LEHxAi|Wz;E6Eb^Iep(hKHQb<;FP)?PGbfL<`#5~y&8hpM-S
zDXy80F|(n>g`H!xLiw)-7_`ej>Hi-}&vT#B-1wpOiaag7BK@WGFhux*^d0Ft((|T&
zQMbqq7g>OjUSB+wUY7o`^vBY3)LWC0YZ3cIdYXkyd*tJR1E|<D7i_zli1#j=DyRTw
z0|!uGUV@<(gOt`}!*eDyF18yX3xL%z{Vr#gX-sSaN71h^;Q*2cc~$zdbepK-0L&*n
z)njSH-t0AFrsVH4(l?}MSm=_n(4gcNcA|`n52Y_i-;kb{a_NL<KDJ(xWKzQ`xVA$B
zRN2u5VsRRP@n9gYiK%-F(3o(-QJfOE)b`c2<yf5zr)K&^u`c$|7!vSuTkk_Edq4oM
zVD!9v*VxTQR^`fK@m`U>CcR`ZcuC1Nk`P%s*>U*h((}^ir3coCwZztiC9FxVrcYoG
z7e=^gQ2I5*@hQG(HN;OU004jhNkl<ZI7cCf8PTy_7)X>V$ZKPV^t$;APEI<qX*q(v
zE=qnSJ8ZRm|AZg>4&YlL2blpJk<O>a%v+d>PX1q%?pn{&$T;1?X?c}Ve(_hN=OIi*
z*M3bIY|jR{?0~Q$4u8z7y*a4-lmI3JkG_H>-pyjFt=K=}=8XZ!krr251V0H!yP_4L
zv{fvb=6S`|SZ`;*NeT%9en6SFh1eYfrY=c*N)CwcOvsnmJoA2)Voju9;Ljqk$OP>4
zCF%D7F4ipgzHyr~6mC2?%PZs`8SMfF&B}J`*dqJdVvB3x6?W8G$!;5yL@ceaSsI~4
ze*Kg;A6Y$TpW|JZIOF|+_vm2QG4t39puaiRuV#5`J5H!a1ekMFals!IXhmh)tu#ot
zj_~6^dR}@4TXV#_UZDetNV1OsneAMF4fZv+gmG@|5EThQ#SeSUq%AMDq82|R%!$=o
zqOGOBeBR#xIYGw0F)i;OjFy5D&83|kKaAi~?dt{3<f%D`Yl<4QX8o2<g-;N|v(hnT
z#)5^6=CklYVykCy=b3bitSQ`mOwx-`{)!-c<N&<O|CB*W;X_Bp@T|$BN@r1p0m!VQ
zXoDLG7DtI=yD$n!8=5;ezf+UJtO%B)8Mp@E7(RcpAumYx5%}6n*OK#P4y3z=h>qL=
zJcIRLvzNw%1QmsiAj>H9CoX=Ppr^3WT|v0<qEWUT{j^LW`iKm;0Ikf3_oo0Mk6>qY
zF;IiW*)!VJhSo;m%*WePkTMH^<~Z=e-mqcD(K))fzpejC{x3<-Gjhdg6@`tvTx_FA
zv3Pf}^3icygHDy0u&EKLI{fDu=5NpY9lM#GOn%N}n8kJgkwE}0*dhBs(2@Jggja1(
z1WVFdv)jYh*G!kHX)gMZnS*UCUJ5SeW|c1(4pCrHV;A)bI9(_gG{^%>Pa}M3@unxB
z>gN%J1%@~!@DMwBi7^S*eKp`<MgEymy~0NF*OvV3BbdwW<A{VzV`nX(L{;WT7dF*u
zGKgcHJKq?b(-17hPOf1}<Pkn?2yntmkNU)}765(?`ZwO*029bFNgQj8X-Ni>)6Q|h
zbm&fjg2A2WId=_q8rwMDn&7I#OEPOsZ<tj#VU|$={!@ZyZk{m9_e^@oc9Bs{3Zv6(
ztSXba176uAv;`As7G~S$WJZzBc15s2n`Bd*!x)XNxK`gZi>a`eoLRiMPYo)^dbM`E
zUbl_L;JXLpn=YEl&XZiaNzV*(iG3DY7-Y=4f5_&J8d61#|31h=%R(*<PB^n!SiMhf
zmBxkb;=<bFmyBaxc=$R7-kTGB1yob9FUN!@i=DH8QW$buI%58Fg_WzYaDhKfEwJn!
zVQMK$O{_TKc#IaaQS?L7p799&S!Ef_0k+7It{oiGh{F$U-bI6f%Ms_2*lxEQ5mC++
zA>}3Pt}8%<&A<WHOuUT8ZdsJZn80aYF8PX1b9-jAG4A>$anO>dx<K_4C9|sl7v}Y6
zC~ytv_ZR|S#?ub7zzEQSgkABcGZd>uo;x6fuml-COr*W>Xj4Fe<VQzt!)tQmOGKC9
zawA($J2Y{c&%+0`(33s^S)X8ZTcnuW%{?OU-4vjVspN;8DwtWk5Y)-_pbx=Uu12t#
zg&7raQqi9l*42T6e(o{m3?wuf5>mn)=*sZnBU3vg@VvCC1BD$gxv}PHg$`eGPSzpS
z^i%0Wqv1^mOip;&>o*T`S>hr+5~#pTLV0#~0ta#vRIzjL!y|x{)&?T<nqh19*9@$>
zWv`DcEL~cYa)&>*q^3)Qz!t3B6mhILlc?CKLM^r?<3&mg<;WU|#O7TL27+$LeS=uz
zRh0%^K9SxX5MVv}TC)DK`4AK7w$$KyF1KTd%v5F0E2Jdg5{nW@{@mIEt}hOH*vu$w
z34b{vj}t1?qd>JbFg+?x3j<Z=WR_!wa9-^UKm!+v7Cg@Yg4o;;p7zR6seLMWzXm(e
zq7-{XXi02GRzs>~Vbz8V<P4J75wOY?Sl&4?d=q!>Ht@voo5B!3q1(|X7?9|-4fMbz
z1Ib+CfhM*GE5fBbBA{RyE^Lf<Yq-n;qtRmF^X&}~?z1M8UVvrgJX8fIIUn?6Hb%3J
z!c)L-E%qdsD(&F@EkI$xw-oG(fDTK#u(XpzUsIhsB84vrsDcSrK=zoq4al(JLDWsw
zCu)x)wCaLq=?qY8z0#nfHPGA?-#8`z?SW4&{3$Gi$5`S8EU*z7K)^W@_Q~}pCpXP3
zYD=$|3~LNe*)v*!!um!^ots2cusY|Y@Obkl^#Nr3>6!Eag@rw~PK7_wb}!KwzWs(T
zT5>FPiNG@3Mcs3*VJb8>$8?1Qh<xQtdYAZm2a_`TXpV^oa#GnGA6y~Av60TNmG@kF
z49ipa%7748Mp7KIP-^46MR3wW{0IPlVTO6fC%3a_RtBUr?71r|NnLsp(wfDMeUjlu
zWNt7dR}S`1YnkfUCeY?sb-@hDY*Y6>!SidW10S#vudoOaOpg6THpf|ohV2ndWmX=@
zIm+ia3jGoKRgh&;r%J^-%bG_=c64_eYIFT54C`LFiN@c~C?8x&A9^EC#_CNh*jYhG
zh?r;$wI)WhbW2}YFGS?YY4w(7LtCq_4Kei05T-HL%nVF!;SjFjtX010_o%Z^tS0H|
z)v#&u!8o6Qz-F{x7nZrMunHAQwwni?5h>lmWlsqL9!ql$v>6kYYtYF#<q_8Gm<VNM
zPd)`*>2!ERO(cSf$JSE1LUjekvm*u0F#5-EK1N8PVoS{k9Rz^c`o%L^G4@C$O7P&u
zw(j76cJNwqoimFyR#w%C{50DxqQxipHx;DgyRg72%jGM}!sSw&aNFF*G3UI?j1AZF
z)}4ms_~4S?6~>}ztslJJnf{79%q?XPB2IXM!a_RN@$2aRl4P;A2IdT>S>d6M2#b5q
zKQ#(bRJj7WUt?5``$s*<za?<0rHz0}ar|TEh)gJBocStNSh+JPhbFQ%iy%x}4x-%a
zLX6EbLt44@^mv5SE8q;aK#ndq=v-lA1SXK26HGJ@o8N*8eQ08vThI`lTO;G43^!lS
z86x|fUF<E8BVO>M!S$U3sDZl^ezF8EPft1Sj~cpQt8^HyEOy?&+=#+$EN|qMUGiDc
zZ!ojAmxtK;HK_O$^2ME<G|-?&WR}Rzh;L+WtmVXsArE>=ny}*e1IDfZsP-9}ILrWo
zIa2nSd{f)Q;j4!{=m_GEn8&q7<8muS)s|xQbTA;12y+6e@TQQ%enG#(&@<d%P6j1v
z8?EKl1w$iu6yZG!4if&zNF}AY#08GMjSY~hG>^co)7S5_oR9g^jpbV1BLqBhh;tlS
z1M8(3fF0$x5TR^J*AaanIpwZ7MbHIlNvHe`&wt4?uTbh7OB7z|asYvjaC#-itThNn
z*r22xtDX2yR6<75p_D;M+_Tb2^i;@l-Rn;(JVzzsf(){Oz?%`WUTtIeS)Wz6Pux-T
zVaJs<$qx-^Yrs@XC9%baMnN_@IHt~pYO&P|=_6?+Julrd{>~Ue$J#n7hXGuqmboDy
zHNPEOoH*U-0CL<-aMqT5<H(k_MBcql_{R9r6C?aYE0b7`l1gZ2yd15n;EXLOMr_!*
z`!=PdyK|=lBYel4;`y3{<H!#9Spls?K-R%%YyKUj(9DLkuWUUFkKih_Y|cuaNbi%p
zJ8vr|3*3<bjQYK9L6=(aL@5pkFmy*Q>!lz!u?;OG$3zLON6-yP#mYjyiWGRCT@{qZ
z<<25fYhO1(MH7H}haDT;f*Ndq8iMsLmTF0)ydix_3VgW}!*rr+iGH3+CqQHS;7|9Y
zyF2>;DyV0roxmfDAsSYy#Z>kpxCME7ONnJp1b9JlyTNaCp+^RQIyN_)k#!wHk96+;
z4iqBNk%rR6A>Wnrwt;z$2Imn3u$5sk#uhgOTcb8sWW@B>&mS?-n(ZFsrLZK45_xq~
zCv(BpPXXM9Le~iCrqa))#U{NvbV8O7aFL?qTBA_`uq?L$>=~rhwMB`En_I(+8cQEY
z?-5lD<sbHWgb!cZNvM0clm#=Qiox{>2A^6B-G&oh@7s|6%G9J=ym)5ek4r<&!gZzq
z^l;!n+zKVa)3HrJnPT~JAe>7ECib*=YLQirB`--IqKPGH?Tz2IfQT9TcVOvP#-9m)
z*7ZVmHGtBtXkbPQ=h%|PUGYPC9j?JU<<_|{!6#<a7S?Ec&k05CvM{>YoM^7GHX&`t
zeggwK#g#8Oh`dJWYq&7MZt7*>Q3g@p5(ejFWb<lzL3~88v9p418WC8I_AR#y2qD67
z8jhCjTYYW?qp8^@Gp?i)>7ETSoseqg))pbd6MHw`0G<VfX+<=YaKrRnm&R(_u>o_|
z?2;Cm4Y5m9<4fbzch)x*9UgET_PJ#<!5vruO!pokhsk<i>DLt#Fvr%r(PG=O?IqYi
zqa2aDw$@mjVWCEbly;90{EuVff2=4Y1sz3joAbdVWZbPZxVqqfdeI^u+<E)6<}YSe
zfT(EVnHbe@7zX4Rnm~A>jFh(ho^)G!4sv;i|9VaBF}%x}ZMBk}cXd+@DIm0^#V{53
znc&IRMiuT@Kqawe_`uqR=4~uxn^JA*8PL$eb4!gPN-J2MbAZ$x8GwWFjRmb6AQRcv
zIFUXXlrf?Jk#2rrEW8!;gA!%$kvA)x<9@#w*<y1?=EH^zn(mt#O86J}(H28fqAM+`
zzw%b-Sl<BmgTb4#0WYv8`M?9mZj7v<lx~6EH%_<#Qag1LtV_RTyOkE;)QBfUwW1lZ
zcYPpZg>A`-pU>@d=wNYl;nRdjbC2zQD2Z2CQ5m|DK0=3jpV&r)0nL>Oe+@j_>!KTc
z8nkp}SZ8U_%!r(#@HdbAId+L7zA2!r(S>X$gcUw;L>#lWoPSB?89PJ+H{M!ySK2C8
z#?4#uAljB+VWophNYG2^4v54W>vJ`rki`8Hm+K?i;nq;lN`o*XR=nC~f@b)Q!sguu
zl0P9nYz91Z8SmK9??@JEi^F^5t?k9HjPw~)yR?e8*c}_2G}C+jhDdc{Iqn=;lsJQp
zk!0K0d22fpyP=NR5|f7^IHUlz1A@ze3C<APig>ER2ZT~A2E~vFU^4*Ir6o%h(<s&!
z9(5)p(zhj*=8)xvjWmLoeKM#%40iDpyd>D)OOB!4Lq|ujtv4>h6f4|fk*)|`*Hmaq
z^826&^cXBr?z#l9s~Jf*V~wJK)~Qg>0aS=?#@UrYO$gYhK>Ibw!j{q}(mhr=n#Cu4
zdr<#+tpJ&k*ejcA6m*mg2~G<-^fAxX6aA<^za+EQ_3skCHrPGaUNeedQH|S>xlPQQ
z*W9do`GmV3Ny2E)t@XCC3}|BpXg7=-z*I?2H3EW&B67{08ox5tP?mH++(jITz}tXL
zXE?xBXkSH87%2DJHeGy+c!k{<dH{`wJbVu976sASO+jE3*w;MRZDNo$)t3>gINJ6e
zt>ODu?61HXLqM$|yu(jQwvLFJ8+L3khvI1;Gs?4rP>Mli%eaYUKR%vqS5QS5u@A@^
zDQg7LjUD1r-DE&Z;)4MCR{&&FjAk-2XJ7<9BFq)t--lL*Z^#KIcC7P>bHJV3)c}!?
zc#0W1TXPu1Y-a==W;{1t2!jzkkTLi|HrT%vG?fy|l@akQ%z=jY7z!`yBNHRqhEfFu
zaB1CfH@-r!V$Uc5O{8NC_?m-Bx-hP_2Z=O!NxnNZBOiT%JrYf1K#w>QHyOWES{NsG
zk6r-Ts{u?2Rz@Jc3n)z+ewbLMjiU(3BYiBrP6`}^+ACwB?13;2Jy^?+PB3>P9B*)?
za{guoe`f~@(HNgIXC-GOAyGys@x$GxCN#QWcuNlSA61qWes~88H3l3E7`TQD6P4x4
zrcv%Ff>mJM7YOVMjUS`QS5`;wp*Tsu?%{epYOUZUZH(>Kqu>M!Vq#x6CJz*Edtv@~
z#|Ez1C@r9GbRV9R1WXNo$_6j*W$u8>3u1`iw}(IPu5So@uh`i~ScL{uI=<q-#={Nw
z?LIqmM6fvGi}JoWTYriS%(BGYPnic7e8d_{Fg3ndxBHz)lT&GaND?<i=i)Vo5om-_
z6U=aJUN!yWF&OrYzZ~X~ndKqLFa%pGDikaJ>uhLlv;Nl%6{^9qGPCC`FQ4y}|J3LM
z5$=SZZ-`SOW9pR=qq<yxBWzHrSp2aS!+K?)6#d*oy>cUB6$afe@QOPE2oW)!OPSRg
z3wWEy_|WBG7E;DnCg+;W{X?muD(hYG8{+$}ku&_JK!`?Ip*`-i;GstL$L0VsC|Y+6
zNiC=vM#vQEG8-7sP_zqB;>J!jOMh@+JgMFW4v$tLpE(goKsYUFNksWW7RrjyWR9UZ
zGCXf%QCc)Ybr(m;?i^!w2zem5sT-1`YWqx%9E%xPQe(JSj(~4jhw{*YRZwRyxqe|l
zSghM}rxZ8X{X=8Wr_YEVMr~%gvyR?cgF{&9667hkxwW5VG`PNj&$ec@#_Xgcn^dw7
zm!dO3K}0H;B3$)(=&jL)7MS|Zhl%A8VEl%T)xyR!t#1nd#Y?U&*e)&njCQVud2SPT
zqr&5h<(c9%GK*k0gu)xoDFUIJk?E$`?XgW>3jWZ=O^s~1b-NWr2{Wr?#YPaCHUxwB
zIIV4A8B{om3jJ})QkOvn6{JLeHvM{c*w)<zi~?@Yd@HBe5Z|uM7<O=2c#jd=yTap@
z=>Oa*0a4cN#UKUXW%Px$#9(4+r`yKM*FQE%M)aAp5C>R}a3j^ggv1tD63Vw$11;&&
zx)@N%g1eCZF(cdzQd{(I#^1DqW$zxqEmljIvuE|)77TAa7>P7u3{*`KC$~6!g8LC`
z(_?}KUGf%M77BD}BTT3LsV_S=uqb1+d}(WRn3177V-sF?{fs(FG=0uYp9*7CEZv6Z
zX38AQk1bi*fuId3<Zv4loM|%ScVn|kYikF&;&{c%X_RZ8<&v!ziaxPso!E?B_|#yN
zuKIA&m{$t{T@K!S@cbiN;T5h+mQZB2?=rXkv4I0IUt44xRf!9zerE$BRrt3c*JVc4
z%5V>>n@-dUE>YAWD>AjPn`5H>oQW4}N6{_-*5ZN_#(xIdxH0q6yLn-GU5yQy+3u~V
z2Q0P=V6qnu6ypGwQi7+~wj&nXDj0D4rS)=6`Bi3!&M^SvHMyWRdf);&*3`<+Vu3ID
zZwCubC~4#y$%-s&l3C!oavPlS!V@E_<d6z8i}22QuOWwegG0>tNp0X=X7Itxn%#!_
zKgLL{&2?lLpJ>DjdTbX1M6G6GX?|h6=cuG?7)CJE|Hal31(7CY);&G!gf9nTh|DW9
z)_KcAhiwip_1XhZrMuE|(sOncL%}kXX317IjQsi&HG#tRO3TuV>^7*t<DEfV3tF&;
z-EFL)e~&dR$bUN+KQP?Hc&)Kmo<gtDt<7K(csTCHGL>+`D~skU)92v^2pcfM0WAA$
zrIc6h<C2COIGlP#_uv73M^rq<{A95mL@cDoNNB?2)E4_SmJnTT+n@{~r)Gl%sZ3?6
zE3@Ro3SI;d8Qv?iX+dkN$*qm`(V4D6v%w{sHUO(jGSgxB?T;|G64x3s;1~<npm|Y;
z?233R-gZOEv>{$!(@wU)9*r3#nup}ZY`B8(?|n0LXEt6g+sOiI&4%YpL&?~PHInlR
zg$Cc!5@0?iUrWEf!1buVe|j9`jUBX@(S<g%KyPfQ_IhUq6S}#g0sRtOGC=DU!Q6B}
z=FN<nlK#6T9ou0F8w|;qe{?<|*fwj};|*cR7*7;w_1*yci2JAd#<6tE3Vjg?H^!99
z>?hYE+)$a=a3sJz?DiT&HMnfOCK}O6e!?qF`?E)v#zD>DL3A-(m&iqFu(rO**c**`
z)0U^|QJLpJNYObqvoNp4&Nth8Fe2EMneM2aM9DOWK+}N%h_7EmTM54vk#V|s#UjhJ
z>iSj3maL>DURc{pi5iVij9v=a7{g^^&UD4|w>W^}rf&GGA94gYHksrL=+pb;wE?hn
z$#-&>GnBDlodTP`z&7`?*bQ~fTjncjnvr9OL&6i!Zb||I!;MPs$21NcSx6ysbv+_j
zPR<==-WB|v;Nb4##%_E1nVD3tFfmiyZH@zd#89E#5Q?)AMr{2H{E;yCB}Ht{`?^wY
z%_Ah7D8rLaxgf3a@%Di>5y7o&E!X(7ph(Pa%79~Io$+8LT2$kGY7RsxEZn}{CoAfO
zb8EzEED0YW|1fEt@JH;teIsmT7J&wyUhaJOpnBz`EVXnd$woufsAd!fdMbP-oictQ
zIv@8fxF1nZsRx=@(sDAU#N(X}ZmKeLeCPwhjpta11&&JnEPI7gY~21iBs6CDc6@2t
zu$P#IAVFmQ-68-<8SF8ta%EWJ4)~;V02{o+IVLt(nQ14S^{w5I0~g?G(zQdbtSpY5
zQf}$tM}_r0=0*Y@Gqz_<DNhJ+cM)=dcWcc3?y;DyRdX%|J}5&3tx?A|<a~j%WqlI(
z_3mlOkw%ukR@}U`Cl6B54jN~M@J=xCM^qvzK#^dJO}1jXuI_Mz6kDikQB6I=9`)+D
z5m8P6=SN1EDhYX4HVI4H&mSmERBEFV6dcGat5r@tkE>aj34SLWKq|Q&n{Aj399YP&
z0UIi9QCC0%`F8B(A_%l9qRUHK+h)u~D)DWb!9bD{*AaBX8F+fcyOFzIA+JvOH42v;
z=zSPLE$o&QxaQk)Y+ZzF2w&-`QuurcmOF=6w8z#Qu3rnCt-<Kzn86B-s?Az;5U6kO
z3aH`|yAYcv3hg`U@Bw?y5XuzBW{Kn@5^GIsuNQC0uAZ`OCtP-l%;NzUZh&KX0_2>r
zkuy?RA@=c2`d3q+UlIMK3VvnBgyqI4s|P-`Ao1{6Id<f^^of;thcRTqVOik-8rHlp
zOebUJ+{87v+dw&B<`MuhYQJ+sFNX>sBaSy_jF;TEva~HUp_5^F&6&B8D*)6fE;B9f
zClq9c{Dl&`wKfyi616O-O|&}?1WB*`P`b@5<-+QCQ9dnkPdyv5VXscibevcxq;N(N
z)9<FiB_?gk-|yJm^xR5Z(eSz&jIubezmNqK6FXn@wp4Q*e1~F<2$lkc3F3LANfqe%
zV8iwdNJR`$8s{Mhkm>Cr2<crP0P4hC{2_(2+Qc4ayoZ0)+$3%E+dyDmxE(VGcD~aA
zhXFdG6qp&>F}1qUd~l{~kWVshwQu-Q_|ugE<2@ozgqb7M_DDK_g4bGOs@KGa4<Fle
zhR%AdQCfK}^TiiBb3hOr(Mtfi(L0h5#Y-QgCBx7+Bo!^rd{2@c$dg-l+7A9vX`Gw2
z5$<ZpA!o2<hwooo5%$1N-D$AorDY};j-j_!qFZ2OlE*tn?Gljplm*5+lqdenS^o$W
z3oGDMyu#jqfiuN6<~-pF((K40v<XP-DDd|{+h;_~3)DUb{H_^6-CCVY8aRNA*iHSh
z29e-Rh_afSbZ}*ZBhRcAeP%$?6(a2b)W^~}Lw=(6xuBG?!3o!x+8qWo=q1A0t^tw)
z(HLP&d!5MPa+eIBt&K4nn8}<k>QRtdFt|&=i-yzXZySqdLX+arO|wKAzI)0xTo450
z#(XOJM`K;D+=$&<Sopl4ZV*<zr799Sdo7+j(lgQ>6gBHVR6&|taJ=0P7EsPJpRvo2
za2-)T4?=h}Ob!k30acFRRmnz3kJ-%azyU;bxR2700Hs#gt+J0u?si~7J&XvdWEEQA
z#EA_I?1JF24HKNR@)hf}0$VSPO5QnuHHM<$<uhx4Y2DrIEJ8YrJYvBEaB4BYWA^33
zYk0y+%Eksy?as2mZbfTmcjpKWE&<7M3hFbfM6Cxh+qnhSSmxZ4-I^}SGei@`{p(M`
zu1@QBNX4=O10SQ>8#6v#1`v$V7MyrV=h{rVV-qH;ov#pPScC|r^|VY__ES=ni64s-
z5az^ikizV_Q0<_1_V!<CbmbA|<%(={Y=OlN&F>`=4RPhbVhW?X$9;PcPDTXZa^_Qb
zs)E+t(n`TAFJiRJFa4N7wl#}>g}7Q%=H8tL8(?YaRcw*;k7y)o`aJq;7;Q!KP$+<;
zJyb8cA4@c}ahY96?@8}7jbO@h6)a=uO$cSjr(opgKG5r9bU+r(O`tjSsNfCuH40Y+
zlAA(nud!jh^e4)<N0{5p61<Il*9a_OH-?*<^v<RGENX$4SD1vmEXUegK@vUVP3d)x
zCTqz0Dgw{i;;7m&)L6U}-P~cjMGa}p=)x%DEC#b|;!|76ZNwz`jl}?-qYrIxutkCp
zaA~J`%t&P~fvI{d7N@F2LNEcbUZXSXLHXR}zcLtnN(Qt+7Uv-8&CX$VOX(5EhHdZ%
z!7H0tidGT+cNZ_LGE}nvJP)mn6+Yx|#kZ^(e0yY3|Jbtj3#{Me;7sp~KA0n1e{OP3
z;+&vY%bhiZ3fC9zE+=H4^h4j1od>%%Ofoz(`Q=vkSTdx)VHKl1a*ZlJ1oNEHP_%DJ
z#DpMqwe5f^ti{}p<P<$cbLs(8aFoYR;wZP5VaY~YQ8kG~(4p>iEI?qGvZ`H4Z|uq{
zTP2{(-k^e(A@K`oMa#gxK?S9a>ABiD3L<RCKFc)$JxzZ)<8h+aQpD<AN^eVVNcRCR
zYKyF+{O#7Gw*lUfekr|RSoI!WVg{FJWFg*e7SPyu3Q?oGqBZOQ825tH#k!Y|^kG8&
z-6GN4qLG~4ei=5fHIy=FCpC5}Q1#npQg%de>F-{b-j!Ztc}p{fd4HVCl61Jc_!rWT
zrDvpPiDNg61<0*4WMN=Qckye|Po+${Pu-=0zfsya@)}LMkw%;uo4$cj@=(g8Lu~6M
z`@GYF#!{}aMQvlkof$JVPHC;|g1C|8=_^AN)AO%5RlpgpI|?wD(mT>S@JGB**0APt
ze8Y#ne?x??(FAnH{wZzelsoIVX-PhmK9HU!>x*nEm@8AbSRYfT=zUs3^JHQt{qFq$
zk3vR|2A(A236G6CzcT9?)Vz_k;daJrZBBPi*HVeCFInwG(Ax3*jdUXYSbCmVY0l)J
zwCrJb1;(@qPu@t6rPrjlrI*ZyMHgMm{;9ZG7lWq7`=#_VD-Ld8DvaR+XP`N!gHaYe
z8fZoAI)ix-#rxXW@MqGaK`%~?(VrRh@;3YFP})cz5=v%HBMbL^i!EL;#beJNeUH^D
zY|2D$0WPE;NykzneObE2V=oX_i33_7p3Z??Nv}zNA>Ec9z!s^AdRBHuPu9;=ubta_
z()XoT89Z2G_cOEYV^r<xCNr90>&7+=?h+uZHkwE%?Db_CQfT__MuufYO@85Jkl1>s
z)Gmbs3;DHH%@3S1+m`<?rT-}1mp(7u>s!K&EkR}Dn64jTDSafpD?O5);VGhk5Lo2e
z3T@p6IG29J<mQO<E)0DwaY<Jw>uwfMiQU=QSk98#SiE_QW9^ubA@<6KYS-3VU04Uq
z+)==QUDG>XaV}j;BVdNp%rU7uG$%RzpGkiqJ&>L?^t<Be9&%#L6k110h_6NXW9gnW
zlAbXndt_5PBARzl==#n-m;OweOJ5;z-e9XIcAQnp03PoQqGh3Z&D|no7w2TGp%t_)
zi(mnz|GhE@r0O3MQ4Cm2v26-0?i_<tAfY9xK*3AZJMGUerN1H$KaxI{Hqui-+liQa
zMXvTZ!q{~2&!m4ZJuQ76oh!KchN{LMT67kZ<kH1I=j6OQOxj&h(BI>>!9Zy|eeE1T
z3)k<MP&CZQS~>wvs4xk`B2HsGm2+eC)R3`Kr_9B6{x+44f!l&YBRdy6EZpX%Hq>M;
z-jn{lG?ng<NM2yoYGWjvNUur1WGL(P7k?!EN9kC4)~1RbO8c-YYV7ex()*-yl2l0_
zOMk<0OfR8>VFohx^JD1^=^ei~gumhSCWZm$fRPi7NdduRe?Vi+4Q`zfEH(XfIQSa9
zB798>xv~cAiLGcu;5xOyrQMeQ2hyJt>3v0dV9fqA)Hv;`_oWZLaV7Qb{(x(q0WsYt
zrdb%N=pE@TbnN=h?@NCv?MrtFx!#vfqzBSnjzLIl*Y~9lw&gFQdM!}<g4i*myKn;|
z>mDV-9*`4e^@vPuY$fjtn)`40c@~~N7i67N9w)bKG}=#gv&c5m2hx9#Zb`SKmwBQ!
zL3lx0m2#RB&!bn;yVAdtTIs9OQwCeDkjo=l<v(Dkb0`0Ir9Wqi(DOXrrF1IYlAglJ
zUqN%b+A05vOf}eU0V0enXk3%obt2qgS!?2>%2JUQHE%48=y70|;6KwoVm{$3rdY)J
zV2+2&|3m5f(t&hWdI@d0Af_J~QRRK<Z5Lts(Z7@K&;-!(o+&MwNOv#?A4qSJrFF*Z
zd(s__-V_f%C2qWAzV0Q_doy$Zr43OSTd!J%0}xfhl1a_IB5DHgZd=2_iIFd&9&m{P
z=@cL+0AowRqp}gHx$l-C04r4NJ?XEced$y>!m1W%RYEsDl71@vz)v<!7r!h0g>)=E
z1U3m~^MdF4MEarhQ#U}#xW6I&FOsA$0)!nxhK(D%Fa3@5BVRoiVa4AS7CRqUXGQG(
zyX2^y2&332(vPK$EvYSP52R-(|A>w%(4Gnf3Nq!=n4m-XUrDc%qnt=D@bDWOQJO5_
z_odfd4O`-vOgfid#c%9c8)^Zq`TNq(w&nkp^ygAZr2D?~mUK^giQ&ZwsrZ5P)16s-
z#(h=<<P(fe^ry5|Yo2)!Kfx!rj4~;Vno*KdB!FQt(Er8;6$aOGV2L6i9N$C(4JrIv
zX(oNy+V-Yqp<3xg`jPY|57Jw_6i|LvdMLeVFOr-_Ld0H|eld`Ly7=Epne-XV*kkFD
zbW6G~<<ev6J?SUX+oU+XhYxHtYBfQ$EhvIVFW`bKC6i?*V<<2u1hc^&UBc*TtzWKV
z(d1;BbBo(@u<*v(aPLU>cb<olBE~man=gPx&45DAc&SSIq4WkPZgnF3h4la9KpO#P
zopVt6Evc2>mwvR<{=6;yJE@jlmhMYa>0KUSX-@AK($D?kj)_!~r>-pm%@E8nQQrj&
ztsab;**vHWrfE)aDbVW?0>}rVTjRKMGefnla-e#fZT!d8=u-NHw2;0>&Xr!&9YEsL
ze$;0*UW@TH>3^2~3U!H^>IO4<E`2P$3qa7lLdv2qr2l}?-RsYhOcCgZ())uut6xXI
zvVprZNI{Wxm3>-tcmKu+(lzwuecpKuL2GZjf?rZ>j(*e?cbhX>XGH{gv2&Q+U=J*%
zpWAGFFU!|h;&%r#6_W^mBAr+%Hzzx9%@VyQy*m(LB*K@{_oYM1no|<msFHpxy|*oY
zq3~0XkKF7|4n1{kJq?`}#+7@ZoejBHOCAxh=g9aCJr>Fp$=(Q3P4qqQnJxu9z9R`v
z`u|gDA^j!CgD!0^=G^HM>HTf_UrBFDSJE4}?uZI18wPkPeXxD;h4d5Yl;ETQIh~Qa
zE@;R5U>gF;sGc0rKPN(mC7y6&8|mCg8=d{xXL#8uq~{}>gRxIk7aYHC=gmo3CisxT
z4qh3d2pL$;WT*U-3jbU>mHs1sx5Qn<RD(+TNP2Tn$xLbX52O?6mUWiI11A*tRC*|V
zux-39xbq!`N=Niuu?1JsL+KqiUcDN?#-OvZ?{&j;pBbO4!=Rm9nOpzCL+L{Vn-=-l
zp1E^>HECNyLZgwPF*?hN2ie_??Q8H$=^aZFTZ2UXW;;qbXVTB5H}E^5D78Il^cwU&
zHAmXL&c!8P+H!Gq)195kIkj1P5rIZ?dE{5Yxb8?2QQ64AiKeV#5Xi>3m)^}wBTqCa
zeeC`@!Vg0HZX#1yf|c~X@f>0me6e?n&@Yb5yy|)5#cAbVlio%0HD)9@Hr4hl6!pKj
zHLhxFut9jmv9(1EQG^8~fGcX3BO9V1tBGC?=Fw-E<s;I!M}~Yo%f4vr?{|Na7XDOv
z!=i+s8HsnPc1odvpZak>#|S!2V^Z9$|5wu6#)oYCiD!M2{4?s6ky@p=^Mby>u~j0*
zOzds_;yG)RXgg>2<_oZgV^*%GKS@IpFubG@ZqFJLVrxvieEh5EzpNVH@;@7EPacZ!
zG33jgyv^2<^WFNtklwR#Zn7)eo$?>POvKv}#8ia8CEU0T8F>c*HAe0%Y4=Rw{}Cus
zBOOZz{8^7ew8H48=c$YrR9QOK*x0Ze_dsOSuJ@rTt^K$A;uYL|*-hI1KR*59f!*^f
zcVAra<ReS+TjUhHtV_5E3s<0Yt2G?|XcsMvgE{5_YiE*0T{xJzK^WMu5J6+xM^9_r
z{BcAhvj=oRJW2kscsoD8>cxJ;tbgV1BI*VyT+uNb;lhHVL2ZQal?U-DyZeBnfFhNy
zfk2}qHr#MgnToELAcPlSxY5zH=CHKQO(F;_T;CwV_Q@*T+Oo$_1`b?Lmj8MuPw7+d
zJfe3X2zNRBkf4GDt~~cVG{V(2WwZwbnq!=HO58KT_g)N>+wiGNnj$CGL=Zrf-{wa7
zhX|kW`4s!J{*Cp2W0k;=&XeRn>;nigXE^TMyh&w*?9vw?NON_6qm7tF9L<1U+3hZl
z2IR;*V^xC{=Pe++A&5_vZ}x<B^NaPX{N3O2#T)5k={2h1EfUR14zjJVw2OeI)C6Kq
z$+;0=N3aMY1L+m<+x~xrvB7gIk*x_ImQO(8Zj%46B>BIWi+|NmW3PA7A7j@VB{D}N
zmrev*gFKvIKnj5F8g!?$;}v=_OfV>dFf&0T=KWr-)J%QyWCQfKlEUA?#j@)MG0>-c
zf~^*GpwiAX%1D$0Z|j<2M>M-^=t7Jwv?iLr;LoOJg|0ti={|=L6;J^}5K92snh4?7
z*8l&6^8fWN?%1%E>GZbC4|?Vmz;aiLjr{Gv4&aj!5Lc|#8d`hfHar7y9g}ioMk23Z
zJni-Io8M48|9O6*aQcaKCf$WCa4OXhm-nqoe&9e(IN%x>x&ne|U;{@J!jdNM(kp0-
z{%K!h1KC#eAsoWttbYSR{;eecpJdr0-WP6QirtU8RoWK|YT8{9uClWRGNUgPFe%S~
z^U6VTl-Y>0vVRh0gQ4>}-eM<s@Sp4F*I2nQL02}DqTyF9tk^@VgmnfyYUC@(i;ayl
zDxpT@tccuHRwUlHX}mbI*6Erplu><H{Kod@Kf_Ot23lfkax(_e|24LO2#*FBXp33e
z2Z&f&1+-vB@FiBQ86cUZWp!y;3Iej+#)_Ukc_jTm^iN87=QOS_jO`#s3#Dq%sZUh#
zc&njmm<-T`9dq3>8E`W=5vhXQ8BtF-0B?McDI33u1NhJUlQE%XW<#Z;{5!EGh{g6e
z^RVF~!%SNiGLo$*$ax-iEy<bmv~-J7?26G}$A-J7dmcXFNX389p9Ze@BT0R-er^Qh
zj8ferFC_<6Cki-MMk8*>l4FCk?!6npin%Re^jxLCIj6IvzG<N8KhRId0xAf3atpIB
zNkr%P&XT|HeJVQ+sWnDV^tMJi*IJr)E!;IN^GnP8avN<Cfmicn4D28DC$&!B2R~?W
zbTg(QME<LtgLGQ!z&HYJNCmOfXnqD4Vd3GcSI{c@9cDJhYlK%1lH`sG>_7BR6yakg
zMs4$Vf`YNJIsd(${NS$~N-bRjM}(y#Sa7<jh&rYha@ilaeFBT8u>x5B+b4hZ3pKUD
z4=HNj!y>e_<y^QAUC}Ukjz(AXvThhfbK6F;tsK56jpi@&=X0Zho--_aG{`;vp?^9E
zs6{h_<g);+Ij|9f1+HR9hKdSk`qopc9^{DoxSuIshp%zv8nj{Gjx{K)a=mBAWIyRs
z^!xn>h*o3&cgH8Ui`)Jewd6#*FAkE9lVtqwf>E;jwt-}x@z=+U@Cee~iWxIY|Dx8O
z>fD;Y5;V7#K9m~i9^qrJ88#Z8_QB~!V0O99;apmA{z;#ZLT0Zf^V>loyHWl{f6bz%
z-uB;g_not?HIqes^-anXz`_{X*#;nCY?RQMbWeKdeJ~lB!agT>EbSFSTQ|1Nde0v}
z%8b8pjICO5j_)B&ET4}56t)Bo<oXlUPri7&j_ZrJ`{Jy>c(PRtneSftFuDy`WW(2u
zjf|Tvy8>(I$|$+bziDh7R%=sIYpmFnA7&DYxq;%Gp8t^Tuz>%T0M?TyXs)a9KoIe1
zo-9IH2gyJ{v2C7Y|MI@^Yq#`&DE}QmKyBRd5mQWJnxp#pQqKWgF$5wKio&MsjiKTe
zg9>tvznzi1r3hf(Sko8Sv+@SPb^@4i1oIxYF0&~-@pWh0Y~-Ado&)M|4PjyA?`PEE
z7TXt38A-KoJn~=!v^3m4-g&BDLn`!$9@2g3n5ezQHmt}(hf*k{ed!J%XKr2VB`C&-
zSt!$OCTMQ&Jhq8AIhN>@MLoQU2Q2`*BZztpuwel=GG*Z>(xYuDg#3^D&Hf`hT4}|_
zJLEs7_kC)kEP_I|WL&`M&RJ4X1de$J4a4XUY#>A~9ZI)s%ER@=RUbUdZT#(;X`oTf
zFFXUTP{|c;C|Fz@98k&0Af5+)y8ed<oJ+R6X&{4NJToSEV+50!Ikz5MoN-*_Z6L^o
z=Ea2#l25hr<xMuUFcy@o3$V87IE{2j#Wst}<}lz(UgQC`Xl8>i#>QZ|4nS8V&T}J9
z%;*}p!XKn%JNZ-t96R@j_ixw;QPO<0QwuXp$XyQ5%`8?9pyXUSyXoRX>{kH!E8E4%
z{`BNOWTiELH#vJaG_k=^mNwt=T8G1_UBWa^b+8MXD)#6e7QgEJP(c&H9uoj+qe@iN
z<Z2Y_#%T~~t)G*=Bz;bL8W_B!pEot*J(Av#zAwG!?JP-v&r4sCUV#l<OZTlcCCwrD
z8|gnt?+h-UNY6-LmA)X|XJo_FPN_-#AHR^kcT+D?0Pv*+@j2p*Swc;pw|{LYYnn$I
zoL&i|C4)yk#&ypkk<Dn-9%F9T20=Bn)8#Bj3`NmR@_%0Xs`M%oKr-z43FYUd^f%I9
zNgp|qo(%X4(wC*rGEHGa)E$KSE9s}wzn9)}5vGe@mcA@KE8UjvTke=B9H-LT($A#7
z+3rG9f2enZ>t299huX))$ijVaW2ZuT1@n?uh%SS<MKYZVuh`Rjn6(-4P(U38cN*kZ
zLmTj%^t;mMr8}%*X2rPVYF^)#LeA#-n)Es5RZI*SNZH6I(vPGcNuLbLAXDjC>Gz~B
zOZT}Zk@_C;*DH43``hx*Fyj$@wdNixfH0XvI|CShbY!NpB^cV54lo2ckxa*7UxDw>
zu!kvJdW>%v+Ys306Dq(t#hlMbpJ%XJ3kz%tA?9Q0htl^3?R06)pOL;SeU-tv1#^s7
zgf2_zE$MrMUdW8i^A$RQk3kd8DMMA#htiLwAGrKG2T<4ul-An0avO|QlM`sb<P<*U
zu{4pMl8(^$5qiIGj(13u&8@gl&^=hu^?gNB9nW~9_CG6qQ~H+l8PxD&X)ZlQUs|FW
z{6KmnJsg~=@~ren(m#=2!FLx-zh6V{{z%%F9@44PUHr84`_i9CzeC`@M+4~s8s#JD
zc^0!5K?Pc~Z|*zC9<PBR8`f=Dimk0NtR-11ZT)1F#%$Yw<koas+HlY)zmHh__9hle
zNa35(KaoBM?!Cl&o#DMtB}tz!R<ntVpO^kb`p42|K)z~Ld(AA9QhJ`2(oI}Em0pm(
zEqz;hmN+QVzvTB)CFFb3l2UjNg{iHBAjG-HK!LT@gg4f+e*NY-;M=JkD19Y8LJ_9Y
z)5P&-0}5BfIt^&UgfMku{T?};e!C4&$o~(ee<ppA!1vsaXI=B=&oH{Ba``_ieM|bM
z(&sQ&(UKnhcW+4}>15D}n`o3@m;Sl*Rp}VyB~da{R98tS(qBoZ+wvDmHZ%M!fQ*n!
z*!eR*Qbbl6*x2;SbLk2!XV&K!ZhZd)s<}4CXPUW_!|EN`Ebd}w0F*@dob-p%x1=vf
z4=6^QFlqaU!p<%7sGbRVO8UC=kEAaX3{RoYd>}oPM$#?mZR+3~U%cm}KbHQ9^nx9T
zlfk`7PyW6%lFpzxHJt;<P|gW4N)B^m4GflnC?sjkJPr@Wo4}{1NN-AZ8~!SK_B!hm
z8UBQUnyv#=Qx6Re)N}`SNBUjqkEGvWnn3WEZt+HE(y??Z-D9+G8yA0F`lr&@IfQqQ
zIA}>IF=1NV1LnQfaq%~#e=hx=bkF7(RCYE%BRwUJq#sK^B+=<UEiai_bQ!HREeB$V
z#@wD1LWCQ0#xh1#@J83$>D!t<r?h?>UhL2gK)AH=TDvj$Qo1L7UHVh$>(V_FLCa%T
zsL6}cOVT?rl&6qxNxv)oW9b{xZNS3~mNciSDk0%7O5c+n`<=fb{fYE>49J*YjcvSM
z>Iiy(cJ|~SosXkF_%pTc?~&m$+9tBitcA<T{b|EQC2e9$qi|p;<@tnZ6oIxz_jPIl
zI2{1kuK#zXuS@?}`lj?i%Fvmb@gHmK$wqp~C;G(2e{2!_+6F0yC7wwiNI&z5K8a=d
zCF%F2S3w9TyxLs4D?P*8J(eCwKZkD6DSvA?Rk~XRY&pXaEa|pi42l#5r}2ycjMfkz
zVx4kxW?lLg$W=)nN1D4<vm>GyFd_`y-H00YC#i5)F){zXg2lHdJ(iwjqE%;qzASxH
z`dy?oqcQv(Am>PWMmmyyB>hl&?3?E+(w|D-;Asn!*#tCZCOs>?D7_ggUk=tFRw8Hj
zUJ(%PapZEfRUEyR#z?xuE?@KW7V96ed+D8cnJr~u=Re2LxH0HN!%kf63?HV_i_#xR
zUt}0k>Keao098zTdRlrWYC1^=UXs2ceMZ^?s2m&kk?s~Y=-7|*nMp58-<Ez)de)R<
zWyhk-*tGjQG4zNR7xt6FzG6<5yhWltB8itwYXY1-{ypb#;o7Wfx9l~3qhP7WSi-RW
zSA-S2E5K=^e^&ao^m*wJPf}WM=ZIw+OD{>!MQtTS_)Y2acH-ZJO}DXg$@e(Ca^x4k
z0PJ+%`sZSo*9P*htt2yaU}G?o6>xuJ2XrlXgtdXmJp*(LLpw6uxFmF{Y%**!Sb`ka
zv9Sbq4FoqOh3{3*w&nk-^eyR&4ACrrQ=*x;HiY~zF!YfBx3K;blX*^vnKtZg=>-_`
zneV)>OD{>S^bwTqiaQkM^q!ZVCdll40_51oX|IUjCziVw)V3;@-=K|q#01*MR6@e~
zudNt%{rsh!__=~59aXWcKZNB*{ZArH_U9?W9og9+(Ndf<QZ5xK8(+N7@W2&`TPlyH
zpmRc6cNj5rq`%*gz65VLJZr(8Ns4?=%B8nl{-w)sX{E>lb00;dWC<hokuN}PwkjSj
z3f>{NXx8XxSS`%D1@TUxuN5(AK_f`zBXInaO9#@+(r2ZoF+dFlsOIf9+%lC*uV?zD
za>uLEt9F>)lz@6{$HH6zjqQcxa%o?BRr(^*ulhuWhC#@Il<lrzv_|<4ETUrezyw6;
zOnN9i985G0n^M{Usf?GOn%!BmQk@TtsRa#5*p%2m^868y@ObAivJ`}TUivHtVb=r@
z;yEWa+W0OhS{94<8R>VVr!Cta;fBTx#mUSJ<Z<zl^cm?zSU4l|l_g+wbW}ClT|}{m
z10-3q^%^F$6~+jfVm)(z|1mDI#n6WL3j<VPPkQ)Wfj5~-clf&tUiH96<J^+&+Yui_
z`F|1Q?TUzPYKy-#R%PaNkfg$|09#Bz#V_q#@fd<|Wik1+*!HCtrO#sbR#rGY0J?9n
zHn-uT^`5`QFa$-j!Q(bY4HDC69y^NMQe*#^L-mevahrb0e7y~!Mg?CC@sKT5$zIH<
zyNxIb4a?_;(r2ZwNY7aRSd09}#^Xpu?4JEOl0GMWReIJ6r2*ki42!EQgX!-4dFgkh
zyS&?i5M_@37aUSB8O#yx{?%ZyN`s3xSmhC^am}vkK@bf_dqpo(NHv#kfrZFwN<$ud
zZ7-K$J=W})hL|jrakzW;q&paZ*l}aSMK)HF7+GF>9X{P-ujO_uaJ$8%o8SPt2x5=9
z^99JkirJe7HlSwAK5qvvAA5L41?!5(%Z-XWA~{)*2sZ&BmKKqQi7VM9K^|EHBVX(!
zBIT5vq76I!@V3q99nARAK!i~-S=&rlSr}1mLwFhF$~$H;|Nq?m=d&c)b>#~#SD~&*
zS-PMVL6B%rfLgSA_H4}NU!LjLJtSX~O$87jp-Z6(rBYoH(#3o@_qTnyhu<s!Y<52~
ztPH5i3=eldXUnzM-X~}RIRYo1KW3HQEZbeY_Y0;d8zRs^o>CC|vh<1(#0J(s%y@36
zpkGo)OfNDckIek~)<~mQ)N@)Y@GG0KUh_}Cd4)+ulI;uW2~z=E^kT!HgOlT7h{+5m
zDU6Q!mR}e4Y{oA}OLxEi_XxHR9M+Qo$%nhyfLtyj0!?AzeR^&0k!H?uKmqM^lT*aw
z5rRp3hhp`bm5aC~vWdkAL@1##=LGR5453+&{zO#SjZ`ouEWKnj(9HaMHB2QZCJ|(u
z9sB?l7Ed0(vZIl&v^xIH5}t6xM6%&qL_5e&g|%hX&H_w#e$g<e0;oL4Y1dGbGU@3k
zdN2JQawDP?05P&oi2yKn6k%>YHOW^nccPA1*s&lxQkKPN_f}vt9Tsap)SiR=)}3Pv
zH|m<LP3-c~TRA~o%CxUNkH{=oJHQ8PiLq;9r-C1i5doQBT-u&4dC3JZY}8Yx*>6Ho
zww!Bpf?&*SHeJUb4v0Lm_M({cQ(^`~m_$x>GMc27qNV_CGmBtX$Wv64E{xhTp~e+r
z82PJE+8Q0$LKxn91?-gW=Z2K0V3U@d@*>t~fZkWwjRi|^FgG|yA~X86?i?R09SBip
zqP~?br4J=ZpMee)^@Kq$UQ`LLrH}aenME&^MO6Z$8|-|Ge~f(Z#2|=}j@Hv3|1{|1
z(jul7>zCPx)N3}P4}u%Z%QJ+mp!OdadP1cyZFcxTf1-mTqaJb?4mOLD;zS)gJmCue
zG_!#GlsbNcAiIUW$qODzcaixy@l3R~CgQ--qlb8^#KKw|hdoT-fzY$217|nc4Bz6}
zGwk!su!kB4UQvm<2y+lPVdQ8*Ik})gwBr8Zz4{tnV<gSAJPYG~Eb$cuK6vMnJ8%FU
zyr;rK!Te}4fNT_EZ$m3welYE8jt+Q1TCozjIpKlbi2DDUFn0$YI5VXy438WO&;lN~
ztZTvX)XEAF10BkV+V1eBIY2}r927L+w-6k98(7+qo(15=0xatZ)4NZM3^?)dImt>8
zuC6Kkq!&pU<r!mg&)orZmepnm?Sgo<q5>E%a%U95opB81&N@yjlH!hyJmqWCM3l}l
zwVBoQT4XS~&~_9&#!TWhLzRPklcL7xI+@@AhL&D4P_yFt5Zeuwq`hsFMPnxw$6gd5
zO0|ts3P#NMhgve&rRAaVb1iS0IW7@J_}wtI4ZyHv@q-UIQG^c&H_G3mnL&{XgoPUv
z@Cvg8b6e`9#iyoucck}66QM$cPK{;|HN>rHZf^c$LIuJTb^NX@rj5NE8}P6nOFMyk
zZFqjlrZ>_@NObEzQ5pSzVP3j5w5PV-p@~XBXYoxCh0~W3`D{sRUvB-jZ8+fIS?%l*
zYCOj2@J^ktf51~2veX&`RpCzqa`9^6jXd0ZA`z_({Fo8-L^QL*4Ge1Gg|Xf*`Qe4(
z&N+sq;druo<O2%hy@l^unO4+*gws^q9H4iG+!cK9z=0+!T<Lz4K1A|=LdkAtoky2L
zr8#)`lo>g-#BM~s5hE@xKxHQ|;8?<xOW)_7*T&~+U_P9Yfn*~GoLdcL%c1Vk&ePIA
zb;9lx{{Hb|Po+I0cso<No%QHU*TF)7cLo>*1?RxJ%xtiGH`1Qg&Q8g(+iQOJ#1fK)
z8_)xx*&IVNGcPYZM3i!;XGsN-DzgbDg^jj)VlZ%LwcdUSGYn`B=WT&#c-3-`uw?Q4
z&!mr~`vj;pBHR)|&$y<ZjG!~?&1?+ODy#r-K*sJ$SED-mf-|{An&0u~2drO905quK
zMb;Pba%YP_vsZ|OZaM%eZ5l~QzM-M!tx$w|G>|0LzoPaYiBnuWhxvEmA`qc#WjJMH
zM-k5m&aa4ACK|!Q3^#Cggu=uN2N!S;kEBnc4n&4~gZ^i1Qn;rX)ULsuSM-WU(9rR~
zr*@J@XJ%{40YvR{X+IM%en~D~a6I6E1{x{w6&%8Q#EUKFcE!<B`DhK}xeMzylN}_K
zS=DN1NZ$Aps*@Yy+E`4Ip7a^qxWT8k+%8#(6Z2tPGK1*z9K<nk=NZN+DxjfQg>(Y<
zccKQ+vcmUa?rg0$H(Gk5nl`8dX6#>-sd58Q17SUf7}h_3#2y<MuTg<MaL)}^JtvMI
zW6&K@*9I9b%?;#sgABnwcLj*7v4D~nKiBZ^QG*#kSRH2oUP~{7T!epXNk39$ElP>g
z5ol`z_yR&&BARQU_E?3P2SD@Tp;S+->b9pcnzNi5$xqFEvxnp^9X?=BU{vxJ(-}-<
zMBm&5UIbm&sBMXp?e#Y*!doi_CV|~s8*oE}SC52GFz0d-H(4B?Q95gp$ac~JcW}t(
zc$HY4jQ?HWp>qF;)`IE<;sD%0BfmLg@e&Mv7jnqnYXA|zCAWSF5LZk0&4kI$)0h^X
zCGbJ?Jr`6}&MlIv5u}A@@Hy0ir4@J!t2K7ei6R^6@uLV`YP4-_USbcYAtIS(+5w1P
zTJlS4P?((5zA#Syz=qENE$>PNM+41>H#=-oVdp-K<H#1FU6N@W=pN0@mF{oDpj$f&
zI(9X;Idug#;QHA5N2M(o$U-U_D+@Zgz8hWK66T$AS%(;(T6aoh$Ky|KnNEu*$@r6<
z6?(l|bMuKu3X((4-d?k)E8v*z==p2ro1K9(R3@CU&s+3nag3qM<_!d?Wsb#|fge?9
z^64;WxiRC3CVqzMUqk*02(TQE61j0;H4VJ;;mq)oyQL8C0Ht|%y#J8|ZIPB8zf~|_
zV9z7$!3D>?WElZeLFd!K*c64qVg3GP0ND5Ndx04R?JOMl%);&Q^NT&7n+SJiy|%cX
z4R(HjA12H00boev+~(v*Cwd=%WJqX*i`tvO2JLNT5z+(;D<Yf9<{`G0EY{|+0<0ap
zLe0NKH_aR%D<l10lRngezXYPOu)nB~^NyVA9-3ZG`hb!G{ajkJTR9xJz&<uKVhtME
zf&o(SPiqnD!63Z$_%Mprj;Iitv#0dhvHk}u$>jdnV>1+q%o7rfAa6viG<P5)w<$Px
z?PSynz$J`X#O~#B1ih9%v|(I1EQ*54NZO2P{z_Cg1mF-oN8t~G$9~4Zw9eOXOE><E
zMJssVGdpiO;-kzj4i^`7;K&JFd*k89FlZRJEtQ@FmF3p1pMfh~cquf3#q<{^W)J5)
zMG4Z7+o_a2g{wvHG84<Wg_0MHdD3k~$buH8yC`sG<ei{nbQWg}Ae4-z$tzBdtFYC3
z?D-kiemBY>BjI<V>xyx}8Mn=iAv*rF>{hsuIT|uE*L}cA^n%zK!z6=!cWnbX8cUou
zR;1~5@D$ByrKjfNY67Dr(8e@??!=bD{6kcAa;wAF$4uC$gPw4Zco5l6tOgi;$^!~r
z6!4;zL0J5mJyd5@j%f^ik1Z+rc8~{C^l?Y3Z6tpm1BI^&|I}fEa{lSK_?aK4*BOER
z1RCpx6^imwpsUmPXV8X1T24u}cci?zWyU*ND|;1eEMiz13V)zyD>&2|HKDY4x!j&*
zX~|w|Mcczj{!4#>j6Z6L<1$+6O15Wi(fGiC)@F1ovY;RcED+``U+Xd4hAxnXzQM|p
z>Ke=`XEaK8%!Af2h>ve~wk0x7?YI%E#^bHot{Gmb0CztaqTuA>zVph%eAyC~HW+L=
zf^#2qZlm`a8tPA>Nv-Wb*&YSi+qkPxrh^4^3xp*24ukTmc&iyS&I4gyY(!~(b_~H~
z<a=kBZee;V@Sj-ei5f^}gN!y{W=qzkLTEGK**=jC0~9Q-4PR~f+dYBb?m5D<0~;%i
zfVUt`Sr1dLeAJQE8t*d0q<6!%gkaS=*m(2uf}p));7esIBI(Tf7!Jo7jH|%LUg3fa
z@XWYGg}dH(gbXjUWS{nya%DgW(Ty{xq0Nozci{blsu%9vqo-*i&Lydp*F<Mn=#Kw=
z29|nr|G<xH9LUyWP8L?TW`BD8B{=r+=8-|B0^m7W>fspwjaR5Se{6}B-=dTU5T$}R
z?U}PE14PJ&X9CGuA)G-na|hno{QZWrLhe`w5rn(K>Tx}TrGJ5gz2cGxI6de?vgiwP
zoIwy4G_v$ZYaHJKeTYRXCP!)~X*vNY+%OgD1Y5O6=vK$l8=ok|M-ee9NVOU8gRwYz
z87{5TmSDsy>j)BwSjF@AG7wCxtyzY`VvowA0o|5D)|S|1$Y(OQE8{YxgzPS8_n62O
z9y5ptXBM(=qz~+X`SH$gjQu98Y{0EED%0CZ1LS^3CTKI(EC|`DKK0aR5_ZVk3G+6V
zR`;0^Kty?CKw=MKSoG|4!|<mj<l0zs$oP{PhM!wo%G`c5g0tCV5}Okl?AhmzAokRb
zi{6daKdx`N!ww8Lz`J`;EC&3Mk^F4<r-VjaqYeQTM{Q#O;RfYniQg4m<De8iq5mqg
zy!Xa>39W1__7AyiEFliCav}*6JcOmu)=I<BWrO48yBOft324_-tK)6}bqX7oIgbkX
zoYmS8?>hzS3KVc|`0d2zH88&Kqoj?cm9>SAGh()J7*x?=7##T0V#|QQ*G`1$jO2F5
zsM}F}jp~zZ75Y!<HsG=JE*%L8Fxk;Lkl9(K;gAL|5L&XwP~72R(z8Z7HQ<nCwk#{0
z{iThrUlO}sz`oRN5YRQwt}si~pbr(O>NCm+y$t8lIYVN0geg1LdciMdX4?m<-|+}h
zFOOg&>E;{-yjAo&ve3De(l&@rZJ^L<<OYKRa>@kGikPakPSAw$T+pF8xL(L-H|(Lk
zv`8Q@mh3hKNY&UgOh;w_=F+ku(TlcsDX63~=_Tu4-J9K*n*%A_hQ#96cA{C00qP9S
zXi25Y<HgVLo{`JXh(vPMVrK-S{MdkwSaumT&8T|EP@#DK-g@N}x0V2o2`i4|CMpTH
zdfx+T_llt?0nOyYX6kyB$3@%{DiI)E+<9-3G7y0ge6Yk&?<omf<HmC`xLa9FGh49I
zc#}~>_>h5nccKy%k5^!&Q-TnIW^f7utX{bp%=!5!bS#Ki0xAumT--K|qqW4LLfQ&a
z{w2mdL9_SqYP|{;R%;2v91NNnS<Bwg#s111M{x%e&>8b`Vg6t}0$)pC1{sXJw8F#E
zx_j<RHD~OMv$&dxS%r-&Yi!Es3<0<tAyK0WsOxRKVR}?Ah>SmxTjv;n8ZkVzmfY6x
zy#5tRYj_H>bp;R<FmPD4-iHO*HWp$_q}%e$XEqIW0z!#ZSU~XKp*QJO*T$m`Dtzz2
z3Jll+L-Rzs#8Gs_AyKN(hy}60Zp$WJI*B~8ky|@~1MrFxjU%~*Vav5`K{mQR0{8GW
z(LY#OtT|A?6d$aOvzA-=Rd|KM@V|*7YFgVz%$h$jL_?-=`4CRHS^pqXU9(yXBO@Pp
zmk9g@ab#=k$><iFnI&0c6{AgVx`^6Zo*%e%gnS{(;~J2L8E_4X6B$^^2&@8_bkNcp
zl#NL9YV#Pa^pUia?qOiY<#QK-6X_+2Q7!eXOX*B{K<?O4XY8p6p8lkW@2+Ld6(@Q{
zOLpeD=N_1$LcK)9QqTku;;82ba4U;G!=){Zk(e;57km^-$ZYhHgvH(&@j}Q&HF}DY
z@g0w#S{7EUZVcnvF_yISi*HsxZSgsQON%{u!8pltJBoIU;nsj03uZ-C;AT-7oefJI
z{U<0qGh{U=90G<vm6p<nlN6MUFh52?WZ2qhuM4ZRJVpWH?^Y~*l7|<~R|G>a9Kd+@
zD?<*hnIhBKDeU{986Ceuhl!3Pz;WLLMrut3X<)sAmtHbKaAOD8v?SGglAykn9$0x{
zKzsCx?ywaR$}A@hP);}ei&hptF-ISX<BUM194%oWp)x9ThdsNtYF;0HN8$I{s5Xre
z*9#skpx^oA^NX#zhRfCxR_^GC+HixJ`;d+e{{Z<}v6f!)%Si1vK8!5zH`yGEwB~n!
zU7txUM_RP#^MncnywF;v*;rZ{rPy&Lq@(J6f{@NF(#?n?4;Ib#nww0|6X`X6D+est
zLr&PDwujpcsMz);eB^uxdj%ppm##)(K>D{e2=y6g;m*p98Lqz_MNbC<f39JIE-Y5t
z5<s=LRbM(|s?;`DddU|WglEoL4wCp_VQq$kCxQf!Fs~i)bT`7UD{4(!Qj6S7Rm+by
zqw`=jssRXHdyJGv59w1p=zW?&2c?{Npg?>Jcy|qNxV0l@&)xln-p9XG-1!{SGsDh3
zJf0*a?tE^wkEo*r_d7>#PmY;rokjE!$JSOv-6J$RE4PjfU~UYHn(%F7W6**EHC+S_
zd^ezi3ZoyvO%7we=Mr_pjKDeFJC>v6+s&wso8Gy$0dl$d<AXi$l78;#Ua-PuxioN@
z3bSxaVj{7HgA7nsdu|cn*u3b22o_U+aZAZBu;#`VHOjIX9{$8t&(&xpy(+ze0SVeq
z@KLuE1j~^D%B;ij9M*ls!4PX^@e28A{r;IH6%7l~4QEHxh~{(wBc$lSEcV54U!$Ip
zibTuNG>5ozVe!qLWFyetjF@<^ga>-CYfCLcTXsYOr_f`kUm>^B!Nyv;gUGdpU30?y
z`WPlqoH@_dnb<5bB%M*S`V6eV4DReO&ns)fSh7a97BDi_CRp0J^<rk`x5Er?-3Mjp
zLm+g~5nEXFQ)Ai&Pm#fpJ44|FhklDXFKB^#UwY#3%fOYlrmF|otZBZ&c1YxiI#_Q#
zqmO>a_Zo_Tfv4|`P2US`XO2G-gwlEfM(aGi0i|(GTWs*2yfna*aIp&y!W_en_64sr
z?GU#OUA+SUer^eKa$xts&|6#G`j`Oe>)#m3Bc8k==WYPQQ=x3_KBzUj*VycWGY+?S
zN;P?9?dCVn?>2OX0}JG4NgfhaLOmx7Zn^&xsfE|Fv1({M>=+0;0Nu`UeI2pm3Ab%n
zh1`8;&M>WWekg~@xxt1<5>pu{GWaANdMMq+?v@0bPjMZM%|`1X#EeSiOVW$Dj*UHC
zXTt!OR0#((N^emwI2Nb@EeX|_F_CCoKFsjadzL52J3%LlEU!S2ije<wt0zojonbmx
zbgs3;GnLJw%lzgu%bIp}vP$|=h2Q+pXmaCEZn5uyh$9#n*F+y8T<*<S(u5`9?}n5^
zU@@81N;?l-z4{QPpFKSh1r8`eqWyEuBPp#_?A-paL-;b1@N4g6&gfJ*W3?N^YeggB
z#`-ERk4XTbr#T9DDZMAH86|U#QLin_z7c!*W6W5|1$$T#IAl!Hs*h99NXhHr8H9l-
zL{a=wdg?sfxcL&hJ2yWn(+ZKlWRr#X1`Kw?RHw8=XV$<Rm9=_ArH!J)HMO&7+lboV
zi4jZl(VabZ-LrlzN`DQWSxHZBV-q+0&=ZVS2;;LMdcTt%OCQB7oeWzQB60w&y9cS@
zj9<)g0EK(M4&)&!eX;l=tlV2;bbD-oaxBXltoFd>@1gE(*||uldign6^w1L1omf9)
zjp{|SUlw9}N}pM0%&Xd3Qny3}`^jmJvf$#{rgrS`#AmcoFRa`<2;$RIJhTGD718(^
zPgEJ7dJz5<!zADYca!5ZGHW2ON3PSYXJp!Q{$!5zi;`Mq)#NVjpLXbyffQRr;6l2>
z6Lcf~PrUgV<NEd{NJ|`>%tPh3@gQQi3c6kbh1y{kW|TxG>_^b-qwZ88@RjA1kx(Dv
z<x5O_hMvUFN6SNv8Gbqny?p&M5Y~dKe@*BX<(V*wV-prn@r0SmbKXeUk`SK362^aY
zW&&GQtD=u5*}@k&E%K-c+_3<!5N1@C?a#;}<|EOMpIPC70&2LDuI&X{%TXs-03uAR
z%)bP@8)1Pat-lPwt%p8+j#ry8)xU;?();oF?b7UCg}<F!gUVruv);VY6}9;QP4}de
zr$#70Jm=vX&V-ps4~=;u()JV(EI454%zrEFO8_f5=B6An8UorG(D<CF7j)Z*_pgnd
zy&FXk@rhuk0ZvIhPbb_w3FavU^}$bse$36*q_8^r83S4iN?UJ0JEP0x_H#?DUr^Al
z$%e)@s6l(9+_L0VL$oU({oN=CY>6*g#AAV+?|6`i0C&Vd6T%l;xVF7a()Nss<uns9
zDmmezy)z-oSZO&PaXjjc4V=S`Np^2xzjYvAE}7$c0lLtWAkGl)1O6qmN?r&>kX^<^
zta#7lz?OD~<zX0xb<!n4@W25F=v|?6F>)aoEFq&i@1$+uzTBhk5*xm>c1+QzGlBmG
ziJ>wp-?3mB->J=d4-9y~!#V4>hA`9;RqgPhvttmU!>a9pD(86LbL-M5>7kyF9YDu#
z<ox}$jghapbpR9t4rmZb!q5f_C^zDNNg|YwiZC@EImzucnp_&N>QtxtI0csV`c|aS
z1<#+M$r-`Tffwu-Fruq9;8#v{y+d_Mvqa<FYmmOMvTH)-_`}8~9Z#@0q8YNZ<w+~Q
z1xZbs_-Rg_l<%L6F<OJR1&KJK?ET~<t=wv41p!Ia<TD)W#cdR(Xh(@M`<fSS`GPDC
z9NVxc5GTUcg`J+z;FxxFHk^(I*@(JYNrKt|dY4u@2n#cj!iO82+tKV1#O+A&a$_+K
z^0?ZD;5OuuLKc>`5W7(f6k2s)0!2&7aBIx|nDQ|@HlQ-&8Xa-9`G5ud{w*xLo=4!8
zA!iBZh<!;MEORpD=+40}jYU2&DXwg_W`^odtp9-$NpO~Af{>%>QG?0i&4VNuZ4S>!
zLV`ILl<06cH%q#KT2VkySQt*$nh!hm)1XHBn^`;Gmgm1_3FjzZX0@4t^4G(M+?t&Y
zv0WN1qu;!YBQ`S*TdFWS3ryqIg4BNT`!@*~zXgbRW@l35{LISAVof}KYUA6W*-;SO
zK);QW)8W|tMLs$sI+@e@8i+zUoRB_YfYMFw+nFSd)k|tZybKs~bu7k7be>7qoY}Lo
zuKxgd@=*a{Py0bej1my{fsn7ZD0?~uE}+M}D2Y@cm{WTkYi9sSUj(k%yeosS4$>ns
z`mhK2FvYrAo)$%$u@(W^M`XTWvvdq$bw<Pq6Val~9XUZDH|6LN;(ZQ?!xL*tniJMs
z6Ke(+ZQwU6v-^c{a4S%x!||P60h7|6aeajOQ4OD2k)xjciM0_%3L}G_Ae8N>i>c%H
z)`XU^79Gf2i;)XxW+DR!F4!qvDl(%WWd)!5eAr!eBY$VqCn{bc5Wx+(Q_WruxOgpI
zdJpn)VsyIrt<owH*=X?&cKU2*3E2uumJvFh*pU0d%3qNCpBUP+wRj-dQH3i{Bum#c
z{@k&Mb&n$4VWismBnQ4VoajIer6&Cjc2`bRyC3Bd2V()vjV@f%(jQ^WlTjGf0{WkF
zCe6aUvRlgzA8=FyNY4Hd{J<HYP6hP68MfdIV8DdzzbU;X-Irwe!6k21uqzskVh9N3
z61Ccp!qoQdaQV{|l&~QSOSL<TV;a0q!1x0iR`BT02_mE6sv`@XS)em8;1R*)R=tR{
zJD9AgKK+=H%Y&cj7y__nKQ}x`yl`%Hio?i<CJy2URtjpk&x(>`PEOEz29@OgLv}v1
zJ8vjLHJHwg^btI{3G27V+yx|iV#X}PLzNbX^rg50xOa!TK9w${f+qSX5w^!}v}GD)
zN4%Rn*$z0TxQ#v#&%c&FkUo|kTF=sz)d4%CD>?)E0BCD7GCRu=B5*7*&npP9z5H*a
z_oa8~pDW0kceJiH@E%Xm&gnjw!T^s5Fq80HNupGeqQQqTZ7_1IfPFS>VIYE+(gp5m
zXTRIq!3>15we)?@H8rup1x|88{+~deDv2_$uuu)$+Wfc)AO-txOFxwk^c(Ehi%Y1h
z3xMR^h=#l6srDxMrR`-$mrQkAABoV-CpfKpL@yg8@|q)x1z0m!f_g}7iUMLZQZ4}x
z$14{#;uAYiSmwi8D|Jki9&T=zZ1@|}=cSjVdla-vJ0Emr8rECl6zRPoeT(xcSH@Wg
z2|HM=bP#^7rMIMS({*raW#$&>dxkLTHXUxw2F(q$?JS@<u&~9bl998LnRUrl7B9r8
ztB&KW_5q+)HsG{2jSnjO66p8?=_k_1lL~Mty&-*%9_S^J&Z!Y8vLT9dW5!#%crLww
zF_;+)o3V3a2NHMwKJz}VY+?2+M=)Zd{5)L*E_j@bw72Eo)0%Hd3wQ2@azkF?g0LHF
zls<tfk30g8i1M-Y6Ue{|o;i4TJF9RUjvZLSoPHqv4=IzrAUz-zIbjRqw*uy9N9+GX
zI((djT@qGz)S(N*d~fdjBk4QRiS${^ol>xN#{Na0=tKr1V?u^{UBc*FnozC`7s!q?
z$O}UrXEfL~l;IvorSwzACU&8Wna%6ip*hhGSwm224XGGM(AU!2*q=N6Zotk_e%@1+
z$UTO+kbWq=PWZmVV9l|$6<T-&jNf1UO!|rR6X{I)G!Wd*qV~*c8=J{y?2b-_ngIwW
zbnXR;C8CPJ#d|lBfqLYi2_;x))S%`_&VBeoz4y+H9=NyJE3tkD;y#ZG%mJkBdRzKy
z=|!pJXyeu**kDp_2?+X9+Fp83`g`dG=~d|=MbMhs(K8B1Ehm#~LVz~XPo)1Qb<%Ic
zL`#a2dY0~CK(|bb+8?`t0HV%OgPMeS#`3hh%i6g-NfZMKoEhODlIJ;I?Rv5iYeuAe
zX~gFR4s>o|VKb@2Y5m`k{ziI1`gKb6QMi&-Gc&_m6ifI+>Hm`U($_f`JL0I82dNpj
zHBjK4^rrOZ(oFga`+P>qwgddg8PHT8=dV#>*qeS743P*KUP$)<)%s8*(Ca0JGST(!
zf^9}wI>RLO4lIKJoZj}7SulkSwb_}**6up}`I8j>xl~aGTO<ELol6q+Nc!08plS2o
zkp4`XNx#KBLy<M4-iIr!*Ati?z5M@J`ZHpcWTGEY>8wE{!pJ-VsnGEjge+%he~kzo
z_@@DEX*>*Q0Zdv#ht}NdIo!q^-_n;t15n?gExDz|$qk+J6sO1FYplW>(qBpsrC$TN
zJVAsuZ1$dsWRHT5lh*%D>2I*%ORM7+M`ef_y!z(iwe(}@FF95@Gg)paj9#GNA4~6}
zRD(}46)WjQXby>abC0-dXI+G+MpNuf$SIqjSsrwN7~kSO79$)aY9O5z3Y<*eT0OyG
zHgf<;XdX*{4{`jf98i3q0(D72__6dI>HBmj-dy~Q2<R1T#U+sRCDu0q5pPRx0T16?
z{7Cu-{{2^_yF`6YrH|;Mc}Ax9*5n4%jCy@l5@YNHfiEdWBxU1XqQc5_0i9;nQe4BW
zTUa(Q5MXyEm#9&0Dv@OKAiX_VL~5ib(s#iczrcaQa~>yb_?7hI(Gu3uN7CQnw7!gS
z+Ox?^2)sX&ej>f=OPJ~|-y&pqkubNR3$SD7wu}VZ92=nYi8j*K(54L$_!6Yx6jR-^
zn4+|IAP+l5t7>E7T@TwkZ#M5ldWk_oH9k0~qtOgrA5X&b^?xe;K)OTsd$2OgAs6if
z=`HD<(fU7;{(;=%b3}8AQL<*zR)QJ6C%x-SSW8c(f07oY4<)eAN76^qtK7MhzAt@O
zdVJhXbAY~ZNf21GaxGZG6?^oQWgZwXk?BOaPQ>1428wh}!xhr9H>^4~wE>6-;zaz)
z#QHyy{#weVr_z_D*Q9%_+!LZ^N!QX3r8h?=<cais>7I0-IR3=wwKY!eW9e<_9c$6K
zG3)<;V77L8MEcU4|CRJ(=}o^l)V{KaufkeAV0O+GMn0NI_8!IrDqAoPDp-I~TyJ4n
zY)0NYg!I1E7aDeWPLNbf&!k7vg-;?F&{=lUd(wZGo=E>By-XdpB9zPsPJSeP-#0+g
zLHbzwdr8udq=z<G`vT}Hb(6m(eILesf9DUSZvzLt1Xg)TYfvsd1Aq7)j%A<#HSGSC
zp_gI04rBuz&o#&+G6ZFBT}U%txWVq+0j9b?Dq{o%Xko3B2dhS=&A7|?$kSzvgY<#)
z*HSHg2_jRA4GSW{rSt>odm}MS>;D~TC;dcvU^7pmqhu|;Fa22h(I|*acm8|nAic_C
zmsso_D9T29NBYri5Kskc;FLuXA<v%LKt@$y=6O(OI?#~~v@qdTB){<@^7}V*Xx+ui
zWd?+VuV~ruGvcF(^?xLNTgs$&r9043?hyARfAoR$hV+xs67Hpsq`#MT((6C~=iF+K
z^?xe;M0#_ygefoho-~u*A>rRkA4r{a&qy%|EV`Iv;&w!uqI6MVu1jOI9Y)<WQL&j>
z8||7(qs)v(7&VSSqibVf;eX4%l<f93xzhv<H|)<}vKF6~UY2sQp*bx5pGrR&iEzjI
z{|%EQPdSTk4tSjc+xMhzNk2HgIBDKqdRux3WB399KC*!yN<Vbt<qjZlo(i#DST+?k
z>7BK*C~Ww_`n2YjL6$b4NkbI(=962APehDY2<(}meOo(B<0u0V(eb;|GwGkK2QJEw
z@tsS;`oYBsMY@o_ZKszNHo~+df4dy9DAL6rNY~Pvh+(+N3=P{@kJ0#((`6Zi<e4$^
zJ8LqmkhUH|sX-P`G5RInu5HNE45;p8WW8kclDTvT40mpEXv#$|ZsVcHD*RNslD=cJ
zSA&QUM5Jdv58cK1Ljt6QwcLk2+`+277+stS&u>f5jIh;OTSU-H*T*{_IuLB384e(4
z*`mKAc(#L9gpS;0$3_JBl3}ibOgA_8a^o>N5VB{~OGCvv3{XvX((<+<YS^H)^aJC+
z%`A(K8qme@`lpL8rSFi!2P7YG;+kRb^=)_lN9(HHf?wp|A5TY<<YVz?B(V#`F;Yf>
z3d?FkgHI4hXG8qs#J<|`P-7ZuX8%NX7Db1Ug&9PX_iyW^N~`cQ=~DVW-DMdQ2BMqg
zF^6jHM;9mMdCZGMy|zF*6Vv7q8PK%meGee!U`^)@wqRoOVihXTzBR)D7D)IUH!!CR
zQg|^mym2s|3uCwWLI$s}i5g@%)VY8|pGsF~WZK0ACGP(D$5=!Q_3SB6<%<+(U+daC
zG9x?bW2@0*ToC`>JZJZle+YaybUSX_P2M>+U_rTHLoU`C0Wy9!vv5<ga={&&V<;F6
zOVWY>bp|kUR6sknctJP)fpglDBqq1~$cs?NOPC6zA00P5*+Mjvs}M{0Y)G={>~-Rw
z={pC@aP+(JAk!C%8}7{4#nMJcjaY*BExsI!R8SStuOwer*f^JHOBpPFW^FgYKk2L+
zWdomg<H`4={C_At9ui!{Yj@9=|1Cs4jc^iUcRcUn52Z&(*JSpcc5;qFSf3^96vXO|
z7oJ%FbV@)vKoK%)_YE#mWeQsZAm^j-HMPjzG4jd|#6Rp-vO#x<C#OGdf1JkHvD1-m
zPcLCKvuAd_FQ=D6CocZTcHY&l8(u&Dgi00^D$gMN9E{j2TtYc20*5!K?U`Gw*n$Ga
z+G?P!KY0`?TMI*%?3oNg9XG+0x4q27TmC$s(Z$(qiuN1<aN!tf!Y5zZBE2)?&Nj9~
zHKgka8;}`VkhUv=*xbBVbOv<(Y+)9*6rpmH#}hXF#7XS8En$81TtDwBbiee@X=>+3
zn((|&s`tH*(~SuPYg}R&nHn<eU<-ma=0hBUKsp0Uc_KX;;U__kJu%F!Fgj5N&E(N-
zX97Qe75-UE_`mRC**eXv1#k^6wns0VEYzE|uq5fXg7eZEOw?K{$<4(F^4iypHyuI8
z!5AsI^pf;~9e4dNunNDJ!TFzd=hpJYKz{a+sb?n3wVgRKxbwAn+yH;P43;3w6V8n2
zSwIH>@A%0RC@VVy^(5Ve>VESnf9@CjWta4S$@LxhS|BfH*2EpbegGNc2&5qSjj~2z
zM53jE3YBFu{ml<Xtk_}Y!|nw8ualOrE%TrA)&3`};=la*%SbZU)~g$1cY?r|QseW{
zOvuclHlc>)Fb)kLd;tp5iH&Vo{3ZBF%$o?_TSXUR^?V2Xukq8FV7E2`sAW;E42A2C
zPsEEl?$+?P1|O1QopWd^*92C*1Ic09uW%Zr&8AzTOC3GO^M7OOpFS-aP)785Aw5Rz
zb0qR$*<^oZ&?ll%I<rotYrxBvng5GX61Jz`Y5`hYAQu^%f1hbAJ%0T!>;V2(eKOM9
zOXm13`11;p&TUB4$_FxJSeeA-=m6jr*4tUqR27`~0f1LCG%E~MO%TwaA+hMiFJ%4y
zwR{p6=b`j~i7K5<!fFhUogYsG?-+3xT?ZvtUceq(P~j6##4{U@5~9Cg^34T@nx#Fv
zM}^_vp!I325pN6SqJaDF4T_1C-8Yj)Ub}@SlvDeh!)fSjH2T<em&}ovvG_rPsK8IR
zSfb100^q-aPeky~q`O2fIh(Sz;iO_f&vFE$H56U$V9SK9-cmPki!@mO1(9h-njNKs
zguESW6z)Ly|AigENYQ@DQ1<5nl1eP@3hTdynH#1jh-s&GcKOYN1^qS{kJn&1vdIoR
ziYNo{q5$4GG5T#`b4{EkRD9yQ{7+uRUl@q{W$v7!o)@LpNPn)u4;FSv;Q=~Kk3w{i
z9zP^^TG}9;=>DD?&0(;F0ZGh|_|nk64*IiDum4p+&;NvN`*|<!0Bn+F*wBx9Vz_@&
zipCB{t|9;m(7A;Gn%fq1Y>F`KgCshvyfg2*AvI=0G3@HL6TyC-h4|%{@Rv`bf8L*H
zY$CpR!-8_l21N+6+@PPkBZ3zt=L6;c64p&?J*t}#k0knN_VkKZMmvfcTw|_#iUs5^
zogIJI{_}ZIo09@`%axye@#N>$i!(@f_h9IxY~V4p?x5%{Sm(Yb91&2BB)1IQc)--n
zrS(6J38W<^?u5quy>(|K&-~O1^q(AoX3t%SfU`dN;^Xr1FTZ(uP6a;xu0Z3jI5+eh
zaSw{}Zg}{kgt?B|RYxahNn_^*47-^e<P`n6@#N{Z9$*+Rq=uutXFuoqPnN><3oYSs
z4W<3Wi}UAh`>pa{fq%`=`z_|?fJTTlDtw@1{A%zWV$RnAXAtb$jZLz<Ngwu@^=ktH
z7dCq#Lrt3F7QE*yp@#qab4j6Czx=so>k~FlQ~%pOXZ<^?Tg|NHBsJy-4cM#f^=K_B
zJHWNEsjoqNIasZ?*Z=st_`N+Ul+j!fKOJ38&n*H6k2MMxVQZqSvz<iJWwQhG;U2`x
zIE`uiDTX7cz5LqR1cZq@XK*#6^4&p~+%o4XGfwzK9CDz4W`!qsCS6OiL*``9)k|dw
z{@}6a0k7Xm7skrWKGFK*sBtxv!Jtz$6qlRHi!0j#5s=A%-NT4QaC~Pskxt?Igga+w
zzaUYc85?~E`)^5~au+t_U=wCC7<)S+oA}Wg&iHJU%Vx&%ZP}EH!w_l%X6sLc;DYX3
zT0JP%GS+W;38P|B*gp|mw+R2?IqM&q7ZxWrzkZJLl*2Bp;E=BDPeNEvor{y1i3<ql
zir&<QBsQ~_=z$<N)^Hd+lL&BI>|SA`sh$@D<(A6j_J9HN*3UN~!NPO?VP~dtf7|*8
zMb^6>T=Ndd|1^uPw$e!rigrpioWhVL!ZVmBSdjJB9RB5!Bi28|nGXh;b#z-kWB*gz
zZVs<8X0q&#$GMlsYmmAsUzZ-TX***xOL{E5D}8j#=1t9LpJC8cZs&+BX-0e|y)V5(
zOgy;rne>YEGMQ^+E*Z^qA4wldAB<YiWR@|v=}|N31^zsyG_DLM?P27S4VWW+cWr+P
ze0uPkm!lJ@ceeE<Eo@gbzg!s&@>UvVNdKAis`MIQpd0WgcfK#Zb=(e=)6M=mqa?zB
zbO^wu^nvu=@e&r&9qAS6Md_X$dJxe_0waGWz2!aNe)EExG}KKZvfmiPv;}bP_urYc
zY2M-)6-dFI&*@9^nSZ?j3VKS=Ei<eA1|I&DJY+fuOmx%Nq?e^r$l}rYR5Lm31193M
z#}~gs;<&^UM*y=$c_yX_#KkX5_o<qO^)1Mtek#2)*$5V^5OCU_lgH<V1eY99(>ait
z);WeIMnCHc?tI|(*)arHVCd$?GW4o_hmO^sm;om%@Urv*RfQ;^CB07LUOtrGk)9o!
zkUP?6q)*wYo-xWJr^xrSNfVOR|5dukmKHLHTtAWCn^c~RQxWda!dEddpf(?L%|Z;8
zFvD-nP>eGo+349jCkASPLkIX#1HnJID=m@Wx#@B}I!RL_2av4Km!xk>zahPBwDc8p
zfLeM_`i}H>(i_gl%cXPaYtkP{pXDU7N75Bzn$M+LdSCjs^j|pFXK?2)OTQ<5m5I(7
z#J(1I?LFzc(w|E|W<ACVaNzB?HbQx2oz<7-s9S&dm>+aa;p|0JHzR_b06;tI^4tJ=
zhH@oOaza<)`nK>np5j#cqVz}7Z%8j813p_TiP7In|3&)Y@%n#J`d#Taq*ttMtmO+^
z=_Bde(x0(;gFAm&`W@*D(#v?C19mIfpbP2mrT?$=gX3o4)@-mSXvILo$I_nD7I&kW
zsL7_>=d>bWwD#<4&Op%2$Syqxw>7AuL`ZXM;Rz~sOC!*9BT`7=?@M2k?l3sw4CYFN
zA^#}-w^2W7Ea5k$&r2`c*x-1a)QS4H(w|E|aS=|N_bbxxN?(y)v*FQE&D}7u<G+q}
z-WQ^?OhJ4jy1+UN&U}<X)i!zhU`<lNZH@(=nN`2}vE01e0yjR#nN-8ki!*!DTi5@$
zr7uaZk&OftaG=@mpQQgH{nKdu&!n$N|4I4=0Z_1Ow%q>%>08oYNIyPSp07&3FMS1R
ztL!0@CH|@Ox6)roZywv9f>Bwg*22};K}B-}VL{LIzz1}etX5dtE9t3CvMZQNH2AdU
zVzxH1GlSWZlIw+$y{0mNWNyDG{Rin!q%TPe;-w{HJq4<i=IuO_ewOT63avjc{ekqm
z(nDO~QzD%+?AZfpD}7&jJC3JM7k^6nrt}}AFWSkpvG2*Y{;Kq%^s)58<UF$4IB3_*
zq+9}tMgA=#N%abl*7`A0x`fhog3;aM7?zl!UV2id^hlC)Mn}?tAm{`KdSE2@1oD<1
z^|R6+N&iv$GRF#Ev-=yA;ez>L@B5)nOX)LQ{59#6x81YAIX$?i()-efzW(Xr-;w@6
zdX-^v0#)3xC<zgJOZwTUm(7ciF@!ND@)Xit5S;^5wFx3gD&VBM?b*nh>Pb#~IiP{c
z67IkpqNaba5jYvGH4(~=*MB8_LHc9q-%DRcexH%q-UZIPkXF+Btp7m%|5*AX=`~u?
zYlO1nJ71OVNskE8+t9pp=RcJ`Zv(_i%xl9~lm&yor+W#v*oOz$fNO3Lb@-M*vEPE6
zv%lxY{><5h8Mt$fG3>i2J2NW>oPLe#JD_cm5G)MZ8C(D41OG(&rt~VaM<X*#0ygLE
z-k09>^^ef$Po=NILkJ!~boHiokpw05%9Aes6X{Q-&sy%5^mM~2zAAOnyS{lLJW+!Q
zs!b?gVFAdsbqVyV&@d)rDcv)`bW7ZKPI;yS{~2svGEDbvep5so7j&nF%^DP$dP_8u
zUXgxF`fcfRXl&B8`!sl*S#b8c^%H6)y)6AU9yFS;=XStWMsU88?mCZkF1;fCy7WZ^
zXvLC-E2|+jydu3Mt?0Jw4N!|@#_De=h(#7CJ9v38%3%j%a7U^bU~~XP9l+eh$ZwYH
z$(FW{in%mvSgEH#s`o}b-1t<|E7I>ue<Xd83~EiubjEHa$MjHoO*)evB{7^!uS(yL
zeoOiSGIW9Wol{&r#elv7oN}}CuSvfr{ekoaJ1itf3Aqi%e~q1gG;+YLP3aHxGKx9p
zSi+rUcKzlxCY*s*G&a>gu(0HeubsoV<J6%5DX*y-M`~SLC!{6~P^`kANMB|ax(FXT
z<3o1RU8JyI!dIo=k^V^fDiqh!u&_sTO1&U`Uiy@DDm_ilo0R7p(r-$yQ?;!OMaay<
zC3Nsq`uI2iIxwc-p7bIu$=C4Eqm0l@;uq&QfGvT>9*;R^d}Yb7tlZ5=XTg48)tRMV
z;cGg>KgROEkUl5<N9j+b*D#TtF&t+$rt6{fY3Z)?ak4*i=~L1lNxv_>jKZ`;V%3m3
ze9D{NltC8Kr=;JNzR47)1ITrtKn<LNPctL!+35L$RaINNacwD6Zi}kX&UH3EIl&a2
z%@Qx^C3Ec>>|gq<h;mCT{*KW?1!}TmBeQ`6$fXnM^U~L)R~fywC7@ojV$qrtoo9oK
z|DE(%+=36jE0E0;!JRQCBNLh1uSmZqy-rX0oTp62V2%6NuvR$!Wb~r6aDkR4NkW0g
zOp8D%gI<!`nL{<f<|)3R8ewob?*E)fJ0z}S!g`A(URrC~;K^6g7p32mz9OB2R$SUh
zwhfzkE`16co{LJ)*Q7s`zAn9pGwR47wrt0eO!dS$fa%VEDE&H#>D)%(Xc)~^qYBh$
z6{8GNfj-t07lkHlZ5+_eDje**j1qBe36{=~wgNnV+{W6W%>jn4`Sy%i;2n16m>|-F
zZ{Lt!kveIE^}TPKf61^Rc`Hb|I4RHTL}w*)`!d$DBAQ+Kv8?Ih&q-eewtFP)$c8$M
z)}HqtD9`byL$+q5r3>)vFf#jNEHYFgGqnpj-4hsfCd&g9Cns*Ja4}mOE81C)S`>uG
z*7J_^tI{{6FL8{cS9LmOtT#BhRV-oB+|Lub7AEi_3CviCW>mor@aMPjJ1u}wWpvA6
z63=g2{ElSv$}-TxYH$^UW@6IT0D)M;6H3X}rPOdd<qT-^V3TS447i4Kx8{a98O#|*
ztsoLP^ETq%0R#&3CFvEZrGf25##`S*@1nTSuR&VF&oT&li>#I=^w&fqGwCk%oh&Z?
zlJxJSSMb-Nq9x){SXfaV3sA;O$0Ve=H3S4PY>VR_2yn)s*rz;muy9(8NoG^oi%}z3
zupm~9`e?`>BZ6C_l{>U=V1tqmeT@ow$+-ifAhpM#H@K1B1}E+LqVyFzExwaZ5Uc}3
zMLZ5G;?4`{f%JlOk9ybM)U{$j^@agx)21i*FLPd7<h-4oCRAg0#&PHYxbxgjFxgvw
zM{Onc-UnnhEhS#3z=wv1h){gsJ$(J|N?(#bC5d&u<kwH26KyQM>eoLRpckZvz&=S}
zo=QKJmeR}Q&gbYwjOXedz?Uc#HCA^!@Z?WT1k1?)C^yNi4Iw&XIm4On5$Zt_bzzMg
zJ9a*|1IEOubsy7M+$*0j*Cm^LA!SU>ZS0uA!r-96!=IBNE!gdY^^->sT@E{QdgXal
z`V@7$47;27dut$^lmy>(8=ltxQ_`nw82^%Z;$ylK&oS@Y+ZY@*BLZui_ZwxLmPjrv
zN6$s;jObj^N_2n@l)k;CdKrP`0Ds*vBV}%UlZd`6c2UwpKoE@`z>?|Jcd$|!Cy%$o
zv3^l8?FC?ga9<I)ZHIy}!w=1kBhcUZzI0EzE1enXEVCZa3+{0;IuO)9Q30!s?|s1H
z%xD3Nq^_sIl_sB^{Z_b#$ltC<RrC&ZYp{E*C4z~F6v#&u=O!NB<)g)b=cEifV#ki+
z(AozA&TJ4)&KsR_pUzOE#-Ob%F0P1_N&<@7D0NE%xDH09?LQgF{tDw51iakl&mP#9
z^{5Eu7YCm+YG)O>bZ0PfZ+#bFE;TIE1@3Z#4qj8u)g%))<)N=JrXzuzk4dgF6QMIZ
z_q7{6|AIgxcut*_K{Dd7cEp8APVb>~AGGk)#7Q=J>QuLXf~r>YpRElr+yJL6Y%c~F
zc8v`Q@bA<PK^A%Z)6q22jBsN|aUrAGrnSNJd(L-}o4(QW&oG*iY~-Yj4OaaOj#zx>
z#)iL01|H8+^UfVcsl@n{ZhuOUf*?O-=vFN~h86A|&f^R+%uZ*T(_;U^A9eP-gAAaQ
z?n$rPocZMC!WnKb0r9+*`>=rf?^q9hRPQoVwbGvA=D!EqxZ!B`<El^8;&TgPdQGYT
z7HVjz@qR0igB6MN;1eX)yRe@k7tMI6$JmCwLCxbmXn+@Ld$5D;!HnYQ`1%&+!WqhN
zZLmemr))@v2hG4OWulGAS42Ha+lk$9+~3%S<`$hgA$iA|m5?a|Lg}S%Po!U1A{g#J
zn1JO-0mN>X=K5No@7d_9R+Sk%e`kP!Z%07a2b|VD0_x1)yrfI_E_p<0b)U}sW{24I
z>04!ol*c0^?VGc;X$IVK1c3S{XH4b;szf`RU~piA&e`eRXy?-XFG0R`(i|?(DPk_^
zLi%WQsCPzE;Zs~TX0C&mc0!;y{=}Bd=)8*>M8G+r1v#a=0T8*9KDJ)z(EOB#E_ug$
z*x7PqfC3(hoGenw5(u;5JjRJ2I=GqwEnVS>H#|jzI)gUa%<v}-nQtxrcws>5v&nJh
zl~o2BY8|K23*2g99YE7ba7LeKfF&~gIjUk?j!A1`{Z`To_%NY)(biGeY`+$MOHZMz
zjHh-+lv7&_SQtc7SQ36?4|<K=aXRuDGJK8bmL}vfF}nE_)UWSbE;F-$q~h%xY(pEj
z-7_TlWsZz#FasUv<JmB*n%IL9qj7+}D#N-nL*-<4`sP1sI6FyiuIzZjj8!f1%&j#;
z-n@Krmye|a5_rbSwOp3=@TsvV2BH(eZedocBpnPCWbf85p1)yRuhECVd7?)xq`DYE
zC>5#IfxxqnF4=sK7iy5m2?s0;K(vZwSh<-o@n)u{g9vDb#mQ}|ee#}HWB?^kF(@EJ
zWD>7h7&Yrae6=TVnoVYq-4JxHU@$axq-Q}2UX6$tQQB;56hLL^Q>2A!Mn3O-=flx@
zq?VCs9EQT%Oh!Kk60CFU`E%9ZFmCc-XG8Y#R}y<R<a@QHX)R;KPK;4Bu%1gWyNp^z
zO&(B?ilsh?%h4#65VGi`t*neF7P7IU7OzHA?Zd>zLe$1KtBntO#r+2XNQNS0c%RCM
z;jO)J3R33B@)I=Z+=!Va7JEx(SyJ9x`}$XQ%u!7zb+kavY^H56OL`n4%1Wp9oKdlx
znayh_&1U~8EDos%sB2SKPYd>;g8^Ng5IgO8n7~4EgMLG#`yxhd;9!f=1roNi2xK(^
z4_ts3!?8bm{yZYv8VQe5R1YM`FpCtpZ_VRG9W6I$>+gI@Rw}^2f`C3+Smubgjukk=
z=ttBb#5i1Uv}1*A7j8r5hM%+uzyb_*O%eW_zZ!V&IZB^djj;j_sI7KB-3jRiWXqGE
z@kS-FbH~1RlRl(jon|H;Yw06g!kijRKDv2kZlGfcJHo7r^MzZ(B_?jZq4d&F7+E0~
zwVey^AaTF`DFAwfRXCWV4uaYqtK7H^EUo{rGsW6qtD<}vzfw;+V9^EUHt;Se$brgi
zZX<j8=RadqbFADR4E|vC{R&GqD8jdv2Q+5rayypk#JYF0NdvmI^DOsx?3^-KO?mIy
zwy=vUpIK$Cuu}Mpt=)rej+@zAFz;(FJ~Mx|zyQ~PBa7RT$Ogl9u(;t;`l+;$J`DmA
zHJsjrWGMUw$*(~(b~Kbk4eQj;Q|-u+G7{WKqSg@bqpEN?J`bZ0f?EQg3XxdZ*w`J*
zxVGc4I$y)ah>|HiJ~v|==tkfrJ&YJ0Y-Ogeuw!r1<DN5YwmM#Vr%`nr%GKDuwJc!p
zgKIZqX#;{Hm&1DBS`TN2ZWg0~h#fKhOPI(f%sZ&8=I=Mq<SRsgox#@Fni<~WXAb=P
zY~+TPlsKaEDY=_V=|kybR$7xoS#oY0)U}N8rQ)|<z$#2c(7Ay(B83l^)&Oi5Hc+A+
zy?jR{U}i0;d*GCcs3aU#Z-e5W#^w+-NM3m2#xmIP;zx9NmpiZE%|;)D0N<`hM2x7n
zrI+8s7K>!JW4z#+H5w>SZF+y;QMV);D>CDt#!X=884`LyrqW?UgchDqo-0N$a><b4
zyU5Q9p{hBUb7_gzK>l0IN@L~6J)u))XCfVd=cj7`xy?{(Fko|wX~W2EoXFMzfL~K<
zTUcM9Y&zG@Qrv#$BR*UMJD%|f5i823Ak~v9T;iX0ATK#<S<^lg>ag@yn9dHN3f^jZ
zk!QSfJ)CkpE&{i(YXaxmS{X8wvakX~|Jc0&y|j=i0OS(KH)DqvJY?Z7kUD$sP)Dnc
z6fz?bOmDn(9(MqEftwc`UmCdgDK*jgaq{g3WDUqMCuB~___5V4b86LVe}uwhCH#1a
zl*N7=FkOd{40MDBfmszc?WREbo=88FKECZp$f$#Dq^qM|ni&zmlB3@8<Ljf|cV@x1
z2<VrFv^1mVPdG^w$Rmy}*`qmhq?6+eEMzi>AdBJoX;BPlnQpsrA5>%CPVk`(Fmk}P
zE%r3LM_+@f4J64d#-3r4_Es2~pdm&Oe}b3k5QiNaTLDrxj0)_ZKkdX?dMb$w{Y7G+
zXi?ZwjV@hzf|e;$l51n47ChtDdbB1Wtj22OGm@Pv+p~D>Y81!DsuT!$XTDnaor*8d
z{NnUtPe8u|JJ=c{X@LlzjJ#j0e@-=jNrEdjHRnfVT#doSB`>@-+E`TM4@N9s!1o)Z
z&%ya?keSK^X=ez4CNr>%Y2){aWOJ774iCJ*&R-+1JqmsxagP4Uz)mXjn5jbh1SuK^
zZ~?owb}ri%r=1%LGypFLpFrXoDrhMKSmD7_W9?H+WiNd#NyZB68^Owsc9I}g9!4H)
zD?PF<unGrIN)L?88QAi49jrq?N^5yuOHUwB#U_UXxcL;u*gV+DJ9BfHQKhXdmmRQB
z8pbW<2K{ajz2pGil|Gtm<BT9OidYHMNxfXBfKr#kLivOZ5b73ra)M(bzq%TorW&Y1
zL7&*t9${udY;l|Q89jwROJf75b~XxNa6$B}B+a^j3co~$W+u0-2QeL}z?K^Z)ECvC
zEsbm?u5RKLW>}vV5hxL!WPMJC3bYs{AxW5)7>!ukmSEx(dt6&Jv44d<8F<V5s125&
z!GY|^QufF4)1l9&L<w$2N^9~>pL*m&lfs<<lAJT$XElUS^U;+u+<S{)2kADt4NK#2
z7RUHs$F9m+Fi(b<YY>ZT)ND}7oZ;5QDOi=!A_79Ld9U6EX1Mh&TTs|mcZMZhn+lAT
zH$^X}ma;U+Nj`kXJ$YnL!`;(jURmNYTKt^+U~`-S2ZXdI_gb>}SExg9Mdb3|vw&vs
z;I9q9+9CR<Rz(h|Zy*Cf(65K-Tu!`nAD3{KxxAS$k&mS(P`nC^K}QyztmjkdT6#Px
zpST&T?cbwtmXTkqaFNlYGN3>OV6sDiW;8~2{8es#qL<;clzTe?BfQ6km}E}iR~{Qc
z4IOAkuu+)MCIt$n(Krdo%tY)dnO(D%7bN(l^hA0BC^j&lYp}V-n5cam`PBdbfB;EE
zK~xz|BSQ3BO@H7oQEWUTmD^B|y#$h8Si$1v`GbToCm`A|Hsyp6D}8&6Uo=EIkyl2U
zI~tjyIJuk5-=l`mk*wBs9%4;jP6k8rq4Z)j;)yW#0CQwc$|$hB=u13j(B^=Jl^MCT
z1#rxyHH|!xN<W_LBMAylkcsCw_6}}f2^^8x$R4=^xZ<~4i^Ku~-7(E`Ec`j9cfkwS
zd^Or0Yr{|WllmJqfUVsj@&I8`_SlOH4<o}|FWB&w=x=T3?{#KF2NmQ7<T#>K5q_Uo
za#`FKJ-8r6p{TZcT$Jw*R3a{q`J_3qPEOIOM6LpV@6d=|{-fldptdXNBb&$BnoFMO
zqU~*96ryVkoh+1UvG5`zSvnY>*-P&&{7k>HHuIEA_uK)<y8bJRU0cTM&j?FDW?aK|
zQh?&k(p&B+-k+N>4?VeS1Ltn8EG^iW4ga&SB6j%ae(P%5SVPf9bg{_sW^_D^Sp;Fg
zBM1p+5IbC0Q%QasiZLS^2}t=GdmOoZF=~YCu&+TxSnxP;&(f-V(->%r^?xRPBo)%9
zq>c<Gz2K=eBQIU}!fticJ)#YEZs>$~@;(R)2r(zLiflVJDS9xUNgqm2ZliLyJX<tU
z<&<CJwG*5WKt@l*sL;|GB!Ubg-HzRfAMIm+pf8kmdT0c=dqR(z=pn9}mj95vj&bO@
z(NMPpDf1DKnnd6P4rxsu-{LweG&-6KdI+nd#q<P6zhy3vh7&+$Lr`l%esgy2z)yrC
zc0eGBOLTpVK*V00p|BfHXwA{0qz(rUfEp?W=aw}^4ItW5!gU?odyGDn^^vS?G*E{F
z-56vwAmW`60OvU15{WDf`8zceG+w*~5o?LxEqI)@^vK2m^~yhJ=Mt`DZBn_Tx%mVz
zJ_@C|JE5fgOFIXmL>{7YE@R5{8ALR~JRsiPlXz4H`ktZZ%gF_p4*_HWT6=@Wp2wbh
z5Y`z(Bew{k<AFPq`@<;zj0`O!<d}n{bZ9~d;}%Nm;@APi^EZ6kk;nt0EBx*N0t!=9
z5|d?=qB34}jZ2wXzJ2qCGeCP0>3TIUA{eh}PAG6o$XQ~jM+A|If0>Vhn8G*$9Xl5-
zQ~{0E!#<3D<yO4*Df`qJZ?XWn+Zc^?@M!^8cVI@BW&?M)t|R~jLHfQIYx0wU0m`k!
z5-DG1m_@JwVnzD(7b2CO{7QNzompRy5Tw=$GlRv?%>jhK-r)-=@p&dSban6C2G)jt
zC*4|McUso()W(tyGJwL4{K+kRkHjHu)xz5Ir)ju3X-dP+U4cK##?Mp`VFm$QF|<wT
z-OjMS2e!G8?l7va&me2F@*R5GlJ!PCXu}UpJK(7J2|E#Wi!i-2K#&vP#lm8($Y4tI
z1Oc8dZSa-Ce$H8fR0@u@$k@o#ev;d{NjIO~k~W0wGfP1`Vw%!?T9kO2z_232i-2Zs
zMq|$F<-|eHq7ItD*}2D&tsOc4DKc2XsA*sUO=Mwi5cgIr?s(b)bVSEV#NOI_Ivakw
zvbGuFi~<|)y<dyD65qZ=r02$@3j*ha1B=4l1(tM)@7`JS$QgvqfXRBx+}M*x5L&>$
zxg_n}nW**E;ef6yo1e79Z$t(VOsi=OI(9U|r&AWCJ|eabVDJVTv@pcKq3+aSx;I9m
z?u9S1?abniC_@E|ej+`X%)kPFx&$G}jg6i&&^3K7t;2Yr`flGg!@;9lGOH43u3s%d
zqKNck#`(&3hJ^l%cFM8*2<urIzCD-jTkU-ZDef8bhPu)72U})tCG#17TXFrk6FS4W
zB<mAVuSkDN7G-A<_(TlR5}L1Q7r!<xQjK)f<k7RZe_^-Hxlcv8>=M&iKr>s6qQ3+1
zQe%bV=*0|9M{ASh#`1H(Z(kb=DZ?4>0d*>fz11jy+f#j6at1^6pBAit#0EW)DzKX<
zR?mP6H#};R!c!B|Uh@XMPz}xQ)lkLf@OmP7@}gT0pw6uEI>XZ}aqB@&-Z_epkv~^9
zsB3P7)&@z;k@NOA2<<3h1zffo#tU<7(9XTjNcy&T+F&1MoSm^J=S**O#~na3toNkt
zO4555>)&7rKVn_R0@NTVA4#QjAJSd}?!3Z0S0+;9SLlo@SlG&W|4D5)WxoX-AW_h~
z55o%j%6O{NOx8fTS0r09@F0q_^$3HNS^qOc((7d@{N5oQ#mIX^ju!<kC(Bp-RYHjG
z-^OCf*|ulAaL!DT1y&^otSarUH!pdDl}ojn4>{RJL?0!E!l#BV4nV3iR{jpdIV-zQ
zg;$z*cv;F*84WWCv<=4mibFceQ3P7ql6pO=v+~x=3P%0@)zSrnXez91MxfZ3MqT3)
zn#uJq&DVsUM{`kQM6_x|LWw2F38N}p#~J219^^1G0TIv!epd3^0X3G0Lvh;{M6=B$
zc01;1v>1=daGk{neyMP7SFC(c*DI2fYw073p$ETtZ3U!`DoNn#sgiY$+3CHXjv=6q
zWjI*)8pb3L_R2k&6JcbjQVu-3vdC(M0bO_m9rf!y-mQZY+MojgOdlq<e#@A~k&8(G
zy8sS)W&p#$bcJzQ80}!ja_o8Y`KS*eYWuA<J2Y1N+AxFsgivla8No#zwBz`k#gMvs
zLNR1+lL`8ZBiJZ;q9}X?f9Vb}(P?OZ$3yO|ZW7>E=tKIg1BP|N0d&{|uQ!%Fa|cM=
zGBI(SK?cLCH8`Nf6Xl@q;m>;qT40=3{AU!Pmd0fZJawW7A24hMmb2i>6lO-sTN%|d
z_r))eykzK7n6|Wf((<-5p<B#FJX>xCAkgP6zU82)Pn0s#=I!y-g(aSawQI~r${$H}
zPRdf-{I8b8aXUH_JNQGbA@;!l*)t3<8!FE@gA56cZa?Si2WKWa3tIE=z5Ik0%y_gg
zUROhkNRK~<ct`83*s23wy)~l}#%BV7n86Duso<}<@-b_1W<<zA3><*_h7+V3ELy>i
z1W7*g82A88(AvT-t>LreH7l~A{xhz_m6*$d+byi{QDP)x8wa0;z^}Bhvf=43!0nb6
z>+}Zb8oxX@_MCu#D&n(Cd)2W51oF|D8Eh!OR>m(M)Nup0s(74=*e#6F%Ix7s)}waw
z$w4lhNHV7?uqEo8bGE_+h8TYHo=u+ac(enGbzw!EK96wMQyZev5iez640jmRIM%`F
zc5BU6&gmT55Jyd<Fa8r7Yq^MEBFb6|;^?_E@xppGK-7gjNFY+qj_Skjh<Z!ItsDL`
zV)r!#)eQ!Bti2V3a2{9%JSufL8B~q+95f>BLD*K%c3!Zt(Q{IX^2plu;L{r6?}*oT
z7{CfQ7=2w65NK2qBI=)8;WHW{O6i<gM2C3!Fu-$6dcg2AbD*7d>-L`rH&*QG-g*LJ
zk32sae>!u^?tL;IwZZy!He-6cg)+ftEy1a1A>Hu1=N>~820L~pjycq($fjbkCSD;b
zi($Dg2wHL`yIyf+5RrN}x<@z8DR*8CW6MBU2K=q?&vEm#XAj5#TFmx_cNr6Cdj(jK
z&fSqR>5=rzu(p6#TRN3`54=MxmcvI3+RA|}GRGtiDrh@~&lDgW2ZSd0d<W}GnNWZk
z&|wtYqD*~`J-xR0XgtuM0H3{(?r|W?9&S}<k2a{_#WF_QQwQ`NwX1-}GHXQ`D?m-_
z<0*v_8TD)jhTKW-13~nnpJRshJWh#<M{2txmmFkJK}iUsC*CmN-Zh3P0PBfHm>Jch
zXBgnl&`S|Mini}_N73@3(VrPrX@P7XtlNB|f*81Ihdx9pFz~xQG|CAYZgjvzBEBbl
zoa2UtZFuT7sDsD-lmh=LCM7sGHBn|Z!qB}~dKgx{JL0u7nm}p;$R;4nJtsJwTSDI$
ziV-HU9K`^oML%mB;ZsZZEI!VyF}ZO0St4QSR#(zf>1B$Yl?}4$CLdme3kxESCs?E{
zxMFKVjU!_2Su}Ga&_?%dMhjpv+Sg=510qt!2&HJSDX7OR%&}e~)D!aWRy0ZV6~|kg
z-~dkW-eV$8#K4W69Tgym%kmZi=>S5_*}oRXb!S}l=qn5TA5;(f8bAw&ud=atk-S$t
zRgNngvmm1W9@^^}X%wf{oQ^c0GB<49sUe9G0n0q|n7|fGUr?Z46Ed8!lM9Q0b4qCg
z7EqK%6T~?u2zRzj;CZD>pUf<7h+{ibz#tbN2u|x!`rtcRAvD3LSsD_%2alcm{X5n#
zC}z<u*AcZXNx$l29(_){e2)i9_Hlzf>8Js<qwv)~!4uhIsJt7argiq*!s@~E9|#;T
zIbAy+63#-rORr;v>As}QyD+|XEO=#eByvrC!q+E(3-ZW@+5ENPQWFqH#L;2EPjO&T
zl{g>>ThCx-;0!^!4y$kt5D^71-NvKOtWp`Amx_m~>}&O?43rsGd`<1VCLv!KZr$R~
z2SlEdaA#+c(28E9rKQCoLfPc=N9X8*Asi)ryK@%olFo|puhdp?sJP+IW&@NQ%RR0k
zMuHNtT4C2Tykm(C9yhYauLJ^7KpZ%u^jG7?wj4<^4&tNGQyFlQ@#iz^@XmaeT4wM=
zlCXfvT_Vp|m#dNd_YNSnT3t#nVj}Kh4;o|nhE>b`9ioC0Zl<<@3N!MEmV)LWh|bXI
zlAfD064t^pznW8TCIINnBIS>)?Pp;j?t)2Z-H1lkSr6~hjOiYDEm#1to&z7$8j5ua
zR1ntpgbwH2qOoZOh`!r&hqcjbYeJ+KjkVARk$dRBr$A*}mb2nn7Lba@C{8dFR|dli
z89GNFt}u!DaS$Gl5SGvb`hZX}3wPEIK?I3897PHvB4z7Xpwa@mUiA|x;Uyr}Jte~i
za~n6GPmUsxf#V_uYOTAbMI3h&K?epTRhJ*wS>ze+HZYzXI91-snK8vTfXVOyB~KB}
z*kORCBhbQnXWAh)URpxb5eoN9ctB=yBQ6AhzQYrQB8>Mxkm(&8xpbl>uT!Cm#iR;^
zA{NBUxhX{u8gh1I%mb03SRr$b<&-m!x=R<n;Q7_;LeAoKSnTA9wy@$S$}o-9$C3|y
zXaiAdE5_~dCxb{T3jCQ#+quc?0%G^a9zez#6bM{!Qi7M5GTU=Q`i@vYGLlC0H?B<Y
zmX@&3iS-6HFrwbtqT$LcW&zc5TtzQ<xfRACSeBWg7zavm6Ve}bp@VtTJuupeM_3uv
zvybD_SD#8xK#6nwcu5_zHJ{pyBKSxXW<<cjs9CWDf#Kbn0%ri&A4y`W_Z%A(T$H33
z?~f)TcNWLBsQ4Ku)Y=~SAU%~H2Mb~pn$4}}Y6%LqVIkY2(-!(AxRgGk(>};68!I`A
zh1?Ko^bq2K_lX>@A*Kubp|QgEAb;M`7?k?PuL=ChVX&MENLRNt1LtI&Yok?1C27xg
zW(LFeHH<mX?Fms^!_F1FU`3YGI_1W*h8#f(sB4J|?2QdEO@i3Nw1|AEw1EiG?9|!Z
z(SGx;IQS$Oc}tSu&Jeu=?y2$3+YcMa55pl8g{2&od6kJz<eoK*Fxf1p3iG_i2KQQ+
zo7lumkr@a=ZUpX*K^wKp|A7QA!+LfAVOy}A`&JDeXMl<4^MUk4x@)F)OA?=xzSovh
z^s7+QCKO|bWa(pA-UVi+m!FOqGndk*jjfquPByew&McwmKLaAb+(v*n&e58BX2{mu
z(}#pU?h%w{(np-8kh8qOOW69kTFGt&CR`XDqqUl13S#crk&a^nbRoSXeatxHJ??*Q
z&6GjS2^jX~`LCs)Nk5}T8^L;j_B(7qMXY~geUtY5RC*+R!EEpq;qww5*x|s|lPP4N
zXd9TI=d_Sy*4&>F3Tec`x(kM>VX?k8rE@n>`<>e(2qIFBQ2i1Co-jbk0lY1}E4_*)
zCdBe=Naxtnq1y8@(&qhGdPBOyFs*XfLf4oj-kFoSS?c$tCj`4Q7GjMXE+9o*bDrS@
z1Q!k<Xe${KUQ<CX0K7Xc$>J$uZ=zmWv43s@8XK`Yz<`6Z69vHq&1ctG(t{;e<Mn?g
z{XqI2Gw2c<{mcO1u$ZN_9QNz~w)CO&I=<?z9S<<$HqI>Q4d}J>uJj(87s}Tn)YnYI
zC`VIl-1r2_ouU6Be~qyL1kAe`NnzTI4GgnAI6_Ewg=mZ+<VgRUp`+Z<1a;4fWU~?e
z<?{cI^i$~t61Qvo)(ZLCu)Bj&Y$(q=((BMqB5?_qG{Y$0gT*&co*zq}mR@2x8+KuD
zWq{21L&F+CX8Gf&Qq%$l2X(BpNiTT_@FhC9Fiv|z1arw_<Tm-Lzw^v`0Hcd+OUO3o
z&T~UcyW<V`f%F~eAvUNYv#4x6YHY*P@Ih&>ek^@gdR2OveP3D@z2pA1^hA1(nE2+-
z-<SSMx-Wgz25@&qUfQuDm(tVQhQTeQ2Q+GSChmc`?7=w)WrScc1xgz+-WCTCOwOEB
zH3t-&InhC8vo}J&3OkTzKbeH3m;9mh7t)#Z>rCLtO@@Qa*1+%YeQ4UezmXnDUoybL
z<HZa*&&NIoAUS|HrSC|evJU!7OxKL~Fm314(GY>r7R0A!h-GH)*cjO=GTs|oafN1I
zQ9-PD;g*8p9=kQsN)b1V+#?KI1*kZ)V?)OdAW7k0ORq_vw+7tY0;}4F;rBK?$^Va}
zzhN3d5U~oAw$|1t{`2PIA4q>Db<)?31{|@*p37Qtz5TKL%j#a<#M1E0fn_#y*!N4=
zS?8e;p6elVv0$`LGm_ujP`EAMc6Yxpi&%~-$S(gsmi|h*D}4<q%yA?4*|{xZ(yu}i
z;U7qU%b8f=Lsy89gEyTYh}>bk{z`ht2&R@mJJ^j;;@^yB@%Ij34oy60bVBKO2IMS^
z9@t-;VQ#hv>TBjPJhggeS<uZ3MEJB|4?7~23h#4nyu0Zl-cEW;`g3U^{T7vsJ*!+B
zcm7&>LwYl8c)Iwf(qBt2vS=Z6Vy!En>vwPfH!`%B-jV(<4De@-47LR`Si_O|ne@R3
zLl!Z1@}4hA9YOGhb6oC7g^eJ-9<}mEFU#J5sGW^%>9AN67_1jAqj#V-&dHo6go8Ef
zjX!(oJ>1{}zL!A^o*BqqlUJ{O{nN$gxciw^x*9I~Sb8G8Fa69nZ!f(o{om3F3Xo$x
z9#|>4C0zP=cwlKS!kKye4VzaWlrBySM^QRMhI5oY!um_1&WL?i-iH!J>?`R*sg>^G
zI`;<tJ(HeE_iyV&O{?&}^w(tFFAzz%8;erSW9b7ji+=O|QkqM@Dt*Q%_l40(lHK_b
z5gsVdf5)EW%zbR^S2B9I-;~~@+kNmU82vZ15HF{Ivd8qwnhgi=MT=RRTSz0TEeb&>
zT!b^wrZY6um9nyAbC30$fIwpj{|CM1uff^7LZX5{`S^Gw<Bs(|lMd2DtnF@is};H4
z6F*q<AblYHl~hU3q%U!lyD;J5@vfvdq@Vic^=p{(h>5;DGtej_9bKZB+ffP@KB0rj
zxH2BGtJjtB(Qe+YHe|FkV_3uOyrP4<9rdvKCn@)L5Yexo{iim;FoCQ;mHw;rJ=#}p
zF20t2DE*m~OP`??J+l*3avEcQAid+e7vWb*(!Xphs(R>s9!viy{lrx-J`Ir633Ath
z*srmqS2pQmz#`a!>{rkxGV4Z&I4U}^H=_bVYe8K{5i}lNSnu=E2p0dOAm|P0&!w64
zA#SMQB*6m7c`SWbdc%Ea5{w^#pFLvFD$D<q19&FADSe+#tHH$!eC0U+_WK~o=^fvd
zek{G^JMT6mqf7J(X^AX41YvJnGL3d280b&g$mEhw(Sk@NPka?42`f2h<F2`wP!%uk
zO%Q3)`u{}wZ&D+DQF_5(yA4{Lp6+|nkGv_UBaV*>_9fJ~69)6`X!w0g`oZYpwe&Ww
zdn<h!>lOT?WLMskzAt?bcHxBlH|T$jqZKA7V9%%|^blM_m7&CPm6WeV=oYoT^J5<x
zsM36R5PJ?gO460|GwGw-rV)2UmVY75umT;3V#xz07UUb!Pe$^e;`kJLeuk8xC8SBK
zm`EJolHN3a#7(*4P4MShdSojX$xbCblKxWqk>7a$<6jY)c4S0Re=mo)!c&}3UxYb9
zZYvyjz-7SzP65S>%+eju02_m)4_5n~u>jEvl;4&FG}064yV4`+JJ^sp8Eq)(W9cW-
z&yJ_!Kb5{ET}dyKEw(lwwxRs=NP73U7v@s>o^&OB%iys9`jP{>klvSm>L-f(C)O!2
zyohNcmrNUbF(M-5ykR@+nB22-q6iw0>YxZ1weu(iSG;gz32#&)rXiF@dM15Gx|05e
zz%#Qhq!upM+tM2&AS{J8-;*}dcWoG4kRlrCif;IKMnQb?{{O@bwKJ=SEP)-pPB9I^
z<s|&gFmTKbaOf<d9duK41c*DxUn>lIV0jy}9s}!>6z(<Ch1`g6sj_=dI;VU;%>qiR
z@LlPd^b#mshB`$}J4xZ&qk$+%{{InAaEe5Di@=^iLKEwstlvR;LwX1!FDy*W#N4Dj
z6Qfu<+vnP9a0^(`1@`!WCvBq)5uC%uV2Z|SEiHS#vPwW7fL==POOjsX8wX7P8p#g@
znOJ!j;qOS7(o0roiTor-sgek9Cq+0BG4I+8xytO<MtV>B$p}P9>;I-C=}l~FFpq){
znliA#oqGpxFvLCm=^k4<x9;CSpZB9uW@gRAYw4PaHc@rhQ)zcfX<wu10L_HaNeWOg
zU3G(3THO|erYkR`t@NJJ1q+MUI(*B73?8Tv{=j;_y+^6Gp%u;XofBH~Q;SX-BlOBf
ziB2<FqyK15$lj3n1dMmhPeglc&txm@NQbDp7Fe>z?9t5EKTy^djV~!#tucuOYcc3R
zIR4b+wm-2*+39Yr0W8x?_<iXun<?Nsw|~z1*Bq4;vlr6#7AAtxay7C30Xv@92%rNI
znCx(%{FqIhn}kO(E#dyx#xaTVjE>W<3Z;)C<bi#P{5nWM&246=CS`o<C{qajRLKAI
z5~e(^vcAU5$ZPfSX)LaJKbGDd4l4-sKPXJQ+X9Hvj++dkb1F9mIl+;-#&4b?gB2db
zopxe!ZisC<U&07|_m+6q_T&vEffRm!FxiLXOZb?QN@?HgOy-)~mM}K&?O|W3%V@Ak
zCf5I{^gZcKo53Ixfy{Qkm;Z@sQfnNhfN=}#*$z^}!Xm?)FFiH#Lela_B!V;CNR1kf
zgP6jCg@%TD5tD9Fqt?LG=}*`*-SVaGIez>=q&B)T8(!H?UfkH|bFUqauAjF41?cBB
zdDH^ikN~5l9asAzY_pqB5KR{Z4h_lLj2U28lmw!wq^Gb1w`OVfsiPjUFiSkYZJxV}
z@!FQq|F?VoDu^9+=+hUnd;aFd#a^ZGjeOpnTS{{$$Yuy^ghWostK)~F2p=g+Va7zp
zA&wrOf$(=0D9o*;wY6ui(c54vKF<d?mSe{Hhb8{xB@E5?Rs4C^e|w94j4QjzzpKq+
z7e>Orw)WT#N!=gkL6JT;EMAMdOTvFji$ghlwVPeN3oa3K%}Y{-GupWx!10-V;`(og
z-|!oD&oLpfd876BE1!4qddhgoHh;W%eH}NEF<+D(l0P?yTaER<HVYrXX0H(G?&o$Q
z)f4(w&n!6I`ZU1|TUXimykgh`Dr>`==1`74u|A`f_=L;i;`a9a{e++B(BxnE;>gz4
z);>R{da$IpmVoIGVVvK*WJzf;V_aJBB$6_GByFS@ke->x@f|BXvvNs>KU)ns@tJ;(
z&u#1fb1mUNYyDgM-|oozKX37OoLu)ZU6ot%zyn2-CCgD;iMoI0!e$X|aiFJGUTDyZ
ztI;^+D8I)87v_MXN&Uhmi|YBW;{oC?rUkx)zvTKizv#|~-41g09{oR144CuNt;Lf!
zx5=T<y&%m%OG_JC5DbLavtD^-R@~}Hf(tf(LA|+Sh{@UW1JHl&`gcyU`sLPt^0w0#
z4+`mpP1Gr9@K}IOi2wD7r{91jpyL5JB5L+a8z&T%fB_A$G!9q|Vz{R#KsG=vzb%OP
zWtQt-<jP;>;tp_mZj{VDl=aAq=Tv#FCz~+?-!E~_6-1OrHdN`_DFi|}oUj$7x0bLj
zh?)6wrYZbCZ~cGXUug{|5RB2iL9ctDIpJ7(9~wcKuT9GZ(M}Zv#0X;~s{jW!(b_@g
zH70S6JzE>9`!Bhc|E%@@g|E*LuN17`G48^_2%th(7sn^ct>8B=EOadK<Ogh7Ff#_=
zb>OcubvOr_>)-)&zmW0y-@5)k?@#m&pf!B2Gy+s~gLe?iE?`OwLh;<l1NTX)WM_>W
z(3_ao(mP?{ZUod5^z|CwTfkmwo|A?CEB%D`4TjIb2APO6+Dc;8?Fa_W`0D~nVrt2~
z2k-1udce8lgFGrSupC<-1&lL(_kb)!iMaTM4A8%dPoSL_RwyZrpWwX~8o9yNd_v7?
z%`Eh(*<+0|6Ngl498Ie9-nBOLfL%AA2*Ld;{p18U&j=<<CjXSihR8U(t>@xI5Pw3Z
zmDxm}zy+h5BtNzvndNQ^_I8d@e9C_Y0AK%`u|I=cGP5~^DMma&+@heAnCVZY3!f$s
z{k4w)%~l-6kWmnd(LnR#No*T#7(KWv#xmBRL#axAF`7sIoX;<VKRo{@e%_0JnaeW!
zCzw~~k_b+7;Jx-32rDgoPZjXJ^l9k<soaWLATge;f5rmN#+7-Qj)4FH$JFHL+vgM^
z|L3gV@lX7-w(XN|9?02}wedFi7!FWk$+J-rvL&xAtU$Sf`nsdMFtaX)K@&{`TA@od
zD0fr?gWvx1reC+M#V;epzwpleVz<qP<G?y=IE|5P8xWz7r1$CcyZJa<aBI}bmZc3h
z9`x)%Cw2r-GdmC|y3aT4LK<Sy_ptwR%kfKEk$={ew_aTO-v`BQ2aD&NFI6l?VV&V)
z2x28YwC=Sa;Kkiz4#Yrp*7Oj|2)?Nfom!~QQH;)9%B>M-K&qqfjVh4SNknFee59RD
zVTq=q=}&Z29j|z2*H3(+`jKki;rAMNB?m&0J2vd~=9Bn&A5%<Ov6sS_g;$x4Y)C+G
zH7wJ>k7|zOTd<2eBi21n`f~fHvy~scZ1<dtBgP%EAlm1w|7i0@uA{Xr=#H-}2|VwP
zSb~xe%IFEuX#EQV$y)3~6s@+P8w;BYKER+)fHWfdnHxJOrsyQK$)E27eEmm%uAg82
zv*CL>7wCBx$Cc4h=zkmK?&&&kPH{2F<R=D1#&#_D^Zo{*^D_dKy@lx~wDsi1g=t12
zC9>O@Enzc+LbfK(pRoR!{ffVTYasuvOPJf@XG8t@go|_Azj#64`7wukkO9=dW#J72
zSI7xVY8&u6c<7`94_NCR@yedGbxwl!A*VdtJl2lJgbH61$@iA6PZhK?J5Kr5)t@s?
zcxeyc8MKtJ^^0RBXeuS$fv7q+zmt&`Jp(GPkMErNrykOLBy(D#Aa!vJp2f{$+|_0{
zKdB)U%5Vl(zLTxAp-Jj>itRf}@zGnoA<7zuuN`Vt!tOpdO^P1ajDTwS92SJH|H@c(
z@xRff|LmBJpF#LIr_vS{Hz;{moV$9My!f0ZfWnM+lvK7{Kivl)`(E@V<yIaEc3E!6
zo%G%##zX{=wr2?|j+&^C6*zkQ<9|INn<0Fu5$j@-12|g$b8yy-f^1<fbwdWfKbHTw
zEn#ky;2n~-MT#dbPCT_0|6a1r(F?X`V$9|?AAAZ4WoOp61)xd2?e~Z?dpnmZTbG>X
z6T@Y;jQBjIovj{qL5i&|?1+lm{I17yF>>qH=rd;o(;Vdb+<ZVwefrWz(Tpy>YXf<5
z)Gq242?u#Lx&DbUdyj}&*srkP*NkE8FCJLZ7_u>=pj!?d(vI-G8-*BA=GyU=DNQ(F
zF&|LDjHycfzQ=YI!+!)TtK)YI)02rkNH*X#=~txB&<!7n{2t$bA-yeqTYAsisgmlv
zDt%r0y!0|7$*+)vjyBZqNk8GN-2UR3^n&!O(r-$yvo=olT}wZeek6TQ`e@Y1m7(kD
zvKYjYn6w9?o<M{G0_yz!ckQfAr@u$x_k`<XU|CPVYY<K`)ViUO%qL4c5B*F__=5Ch
z=~tL*a<J1_YKjy;W&QVl@j`l4`ik@g<Kpa%0N6<HOFxi)Bz-)&dA}xoj_F+s7$F^q
z%2VkL>08oIM~!@<L+2SS#&g1$ne-4YPmMj)Xq;~h3)u5_F>fQnkl3&hJE>1f(y!D;
zbZ$&n*DzYHCLLHv!AsKDq%Uxk%g)S0dZLe{f0Di{eK@&<zbbuhsIXfmxMb2Zre=LG
zx_BXdO8PD7H3nH%umEF3>ZSCC^!JnNFEeyI7=<S|osm$Ng!MN?_yeBhMe7HyEQogp
z-ugJD0G|Swb|7|dZ2klCdu#d?tMFCno|H-VrI&}<Z1TQ8mi}(kPMS9FW$EkEmpRO&
z#ePIolDY)GE4_cbc`r-9B7IeQ9n8IeGnGqQ=_Bd;(m!|?Y`+YSf$nXhYov3bz(MoL
zd<{=+yP}J02ZOzYRd>R;&Mb<MGaFmnTA4=H)w%+MyK?I_`c%>j(r-(DDt(4*B-+a7
zJjG+?Ks<JPR!I+~-;n;J^abffTKOMI1#R6=q>rT6rT-#**Bjc?H-1I>kJ9f*ud;Za
z?O#jpNZ-XCUmQoUoi%yQZ4}~~P&AjWrAHhw+FSo%Va+iyd!8#uC8vzl*p48S!bbG1
zEO0+SL8?IRDuU31raoy2zb5^@^c(olhLc5tew{k)cG5Es#VYAV>Fd%TO25j2?~snQ
z^jPYoOICNoSv5Ce{6PA5(tnn|M2?bd@D-yd3+Y<=4xZqtuR;_(?n)1Aet0y{hTRTg
z=AaUi;JJ4hn-slFg`Lk4UGdA2^>W<5GmEvLqjJZy%z%b&mH!9Q*QEbb`i6AR>{h&p
zq{q@j;`rK^@S*e#=|3^YATacdQ>fCcjlacJ)}!_R4e9?aeF;%*NwXKG0{=um@E{0J
zHvF;l9wP-B4Dg0?2_u5vlLz-p*x39s*%bN`UAQv6s4cj@k^hRPZ3y2}J@bM|L31<4
z<!wG78Svkc{#bed7JF(=Bdz7N^bbao@8$o2^efUIOTQ^Su*uhRo-XCi-zSvs*FX9F
z-;@4>^i}CaD_z74h196^ZCdtj?%Zbp1>~xZsNuka?aUIN;4S)*ViDJr>~)~O1<gx4
z2DV3zJAm<POKhTfDW*a-MpM~O;xA%k<(H&CkbYl!fYS=-cZmT_bfTB&)9k{4|Elyy
z(yx)Zoq<`T9^_<+KQGOspGt25#N6EZbJFifzaxDCw#^>vnV#YflJu7J<G45!ps{Sd
zvg7Dyn1ROT8)hQ|5V&6rB6L9!HwR%3<mTMQJ@(cwQuvf6>_G7%3(nA_=^}92>aR+F
zD1B4<G$@cb2<gP4f^+F@>4%&OaFc(3S^8b+8_Z(bn*+MS20oNdq@PJ|L>(}fPNmOD
zzbSn~deIIR2nIq5Xj2~aS?M1~^T;yf|D5!pHcPUn7q1-_Olse}E7U6H1>`_OvaF%D
zu?9VU5iQQqO0Y31K-is>P@PReaACmzRQjg$S)1!q80ObXcclyIAEh^fOA`T1+Pv54
zK)GaH9-xKkVZSB)z?J7*`l9sP($}O<5e6Pm{)P&8!g60{Xv!GK>TK9-X{2X|pfkey
zf&8^-!Jat0HV3fdK%S*dhUp>$-a%}PxN{F+Qd?%GVYOy@{eMIHQ|TKvyk~KA)=#2R
zd`kLmh;Ur|2h!K2mqDQ$;;sxUl5FHl($d%ep7eF;-%GzQy=13vgwsh2abMa?pCfv0
zdIu0(oM_=#(*>4ly6Hb3NN1#zgI=NrdPvThDm#cg;Ziy*)NW+JJ5;QOhPPm#U`yI|
zVsoC_V;~}uuipWJshIc`Q)pV7^EGoYFXic<mVQI}vUDHcy|DAIa>i|*j^=Tv<m+|m
z_oP3RzQlpbab?Zej#MIF566TIRzYlZhe(xlq14j*QX}1!?&I8h8H|!u^rx2W%Z%7b
zSjEcB{>?pe=ANG#ce<r>&kfj;<Yb@#X%#*teN*~|^l8K<*pmwkxTF;=JJH;6Gd?yM
z@XrBWM3ti<1B~+2Lq}rg(ks%}rLTh490<8f7Geq1o7;&SgHK@)fb4{e0*&3-`p?`4
zXQ1axJB7ZWq*B?Lt~KI3D5XcT6*Sx(dS4DFVmIK~^W*hjNnervPwAV|tCHZ1tw1;S
z=6&u+uSn<VbScMgenWZ@JriMEW<7=nMn=y=dD8lSP5KJL-`UvYoS<RB#NqqLFnTgk
zXVMG!>dxMLX053?K+2U{fhgc?4JQ&7Wz8OS=2-F(esm$-0mOLBc2-!f15^E``;gK)
ze_8sE(l@13veUa@ISrCpB8B}tgS3QSkp2(po3Q7v(42Du*k}}4QsK;{Hto%4rQc=%
z<{VY6Y@o)1#AP)+^W=yHRFQce=s$@%Oa?~O9iPnGf#u}P+7B{UgdaKKLq6hIRE*A=
zF$HH1fZJk^qAE~$?CKkECVfTvE$P$Nv|5^>JuqTmK3u~lC;a>sGT$>hlcXee-LZ^?
z3sAcASEb*RzJ!Q`q8A9@+-ikqlL{P#hAW=7B)LdU+NprCvPrLf5*1ljiQa~<ja)dl
z$SiQqo@dY+23<o(y5a;Lp@w%SXAx)8iS!x=TFg<My;T#_n3PsJ9i3{J#PTx;@Cy4{
zP?gvavU&eufAOjGW$E{%ufcm26CAT)gH?CiJg>~8Fu?uP8!Vg0l~VBKh{z+uHfKE1
z(vE*UCzvWwu$e1Q;1Nkw7Hn8xMjd9bwk;fIfOn-YOJ9>-wMczwfJJG?i<UMaQAz&4
zB>kH73Z<vqVqd4Xrgc3HTqh^|{}qzz4o4qk%965jYl!UllQ@SjNe_t_a|_lp<WdBw
zJqf~V+)JB5*a7$+FvHHs={2X~ode+LjC@q0(P=8&<>Zl^u?k<4USa*_R)b2{B-gj*
zFn}gd@TK&s^hN1KV(pTkqred6q*AS~f3nJ7kiIUxP5{!H!Htms0l!q1(;RgImw1N~
zOMK2w<OcA_XQ4#qmf1!%b_&rIwfLYq7QO*#>t7`H@ETiyN)=Sc{bUh1Dcvg!O9+xe
zaDNN%<TJ9cK@DRiy~Onu+7q@(tV=LT`U-e1ok^cUb=E^ZU<QCU!;@W30_dG>+7f4)
zQqv4-#S0{iThy=^U0-ugSHNhYnVFrRjV>1UZOhJupe%Uu(6fm;SX!sMHpD}?+Xdea
zI@N}`v;8!Xx%5DK$V19#KQXr}w;AOF3eB1HS?SBtD@3}PUEdjrJbV5mugnCiV*bk>
zX<6}7QL1S?0-a%H_BJ-XB(1LT`|f?Sc!g*J*yBu-v|SFFfDLTl&9MPG<xJ$tnue<(
zb-D!6yml@~vR<E)KEr^Qn0*m6;0|bJ&(c)x06r^yf%)q}D+uJGB<0;;x7*{DFDXD(
zrVA099MGto*ks^CgDn&L)Pmw%u&HwjnAvgq8<sJH%h$s8OA*%w{P5}IvE&)#w;6u(
zgh1iaP_PubU5r`@GUHEWEPl`?b4WouC|K97{J9ygyVkkbVDq*>W2K>RJ@(K;K!L){
zIl!#8us*YDVvr&CBO;0H@a2wOy^<~=WkfDKATmT~H0MhZJ||L1@<dmJWfMi<oX}dN
zx)<!n!D><qh&0)dDBMSYbMo&c!q{2jP#<#;GeDLM<Gv)un+?eeg>)&sCw+X|DAKf7
z_sk3h^|rJ<>W*t5jZw}^4A_D=RMum3FgWgJ=Yu43ZJ=p>WUc}auO}nug>(;)uq87J
z!i77-4gd7b$SE;Bw?mL)33AqWW`EdY=-JUMmySm*hnY>fa&?;nXehbuK!_G7N@vk<
zX))oT5h>c10`sXj-$Uko3g&Y6Mf}1{&s|g0XVN3c6&Xaf&21deuw)C{xh?F)6>FiP
zX!T~`gm^pM{E74ksBwYL7G&w!@trdyG&k5b(u|<j?r(ek97B1q7_i~ojoN5BrDchO
zr)c>3oYiWKPE}A=4PI1#<=l>;jpjQqR~7)N-LOk#@JVLpDVi5`%bFm5hhd-lJjPI-
zlVJx-RFYz!TDXA4ap*B8)7T?t@&0pOxI=_``&IhWZxPFU7&1EPaBG-l=WCc@#xgUV
z;do}o3BMj5Q+~s>e8%C+;p_@*YnZAt>77XuazwxfaPL#I=Oq@e1&`m1Du4l=>@3^b
z7)RrPBusNqTVruVyJyCqS0*(3$$562G5%t6E3$zNVi0j)aa@NAM^oCN2`w8DOt77i
zt@;AaTKbS6rZR-fyO35!YMFpRQz-JHw3M#!#|^8{&_Qr*rKUatPLNz_DT{dW(*7yz
zNWsC*N7LrsCKtul26;x-V>I}5fd0=cx(=heU^`O>M}U=sBr1GxHte9uET-KfTF*iR
z!)~|a>6Pi(Y?!RghDYO#t<R2it}s^{FpJ!lU`~m#uY@!tHG9zi4#Sw+IH)DsdFj#L
zPI@dol0HQcJ2RXsvu5)AITV_PTFahopA7?`DAF8Ixz;b<^Usc_T`-HOH700KDf;zu
z&m1?&&Gn{5c>q9LPCkERDjTB5g8FPUc1L7??nzXJ!CJCl3!LtnFr<V$wqmH!;Nc@p
z--6Hwmb@|_)*O!k7`#kN9J)6va|sjjt{s-M@_y0`TP51<PRT-(KAsZ<Zb%3svh2gp
zgW(Ke=ypRef5oiPaS^h!(nEN!^lk?nZv_cA!%6i)L}~rvF%6)^9-o@IEsnJE=2P%s
z(Fl|wj|;N|i(}L6+n5<jA2xi=uC8FR&PGFpGgGdNkf*lM-{CRl$A@}lEWwIim>gb8
zl=2!@G?s1r3Z4BE2wC)Sb{K%d2JBYu10sA(@yZ@&UYZRl?Krcs_SYDIl1y<8S`pIH
zn6^)BL5bK#q<d~Yu(jiQ%26q{fvzZ4{yt!MnD+}1mF*}C4}M{el4RWK4(GbJ<ircN
z+oR_%_`4V;7oqS0O5g%Q!nr?xZpDcMDfi1fj<Cn`<3Y?FV@f}gt~mfC?8GyYz=$Uo
zlitg%K=k{_(u&fMqcHVN_@D;cvH=B5ph|9F{)WV{<d)-=3mbYcOyFRL>*jXS;e_zS
z3VTmMVMpVI!C8*-C<jsQnLi4gG}s&kmbRDw(EJwLyg+_}FOy^7@{tKD{bvgJm<6!M
zQhF+VY(eY^c>Z7;gZG)6eCDjd+_)6=u;uCIRXUKT^pi7Q`Wp9pf(<yijRsc$Lr20~
zkhxyNIjJnf7*rx6%vkXh0hjIxf*dCt7{J!t<rC?N^pbRswQbqMq(Jw{^v0i9xz=i2
z2P>IJaj?TA_ckOm{GmW^YCBkSk31dF^TGPfr2F(&RtWvU{9%Gd&L&NNtU?M6&+N#a
zh!XacF1D6O<stoBngVh=+Nm(38}F4LJK)TWURuIS@RL+rtJuPg9k4yI{(*m0W)c>_
zp{WOT&kt!-$*QcqsDZv6gsED3D!osNdnbx`9Z5~iu3p=5XmhrG4>Z!==6xd|IFQZH
z@uyBQ&du7F#}1&diOk{XcQz!=DRjAPF@wOFp?^siayapog(njt6yfuR->xl&3<vI6
z-oTAFWH33wMPo=qT%#eutl#+*gxBP#VG$cZmC6Wq6Y^6cyp@5EvQ+wxozmQzKf8JO
zjr5UGEhVk2KcunDws9Za+05&^<Pm9O@8DIh5a-m*Gk~yqGGl?)z9h`9E&1Csh$hr<
zPzr27HXEB_ySLbT53aMP(_`-1Bd+P#{d^d^irBBQK(qg3z?dsseg^<{if)C|T93NX
zJ7Wly25E+DC(LR^YuN&U?X@o_?RzR+!r5s}`nI6Y(-BbUu#mNy<o(1FzM7W}w6nK9
zCsrEEY-USM;P#X_FdJFlhGmcIYXi(qkcBmoX){TriDKgn(q*tv97*qvXZb{GD2srD
z0NELfC?f(I2f+a>rs~xrqmJ$|eTnTiK;K6{vBp1ifOLg9hKk7Uf;Z}Sv7*x=dPlBd
zYXow5MmNecN1-Bo-D0i_TwiB#SBY&N-+8nJ(XSKwGs9>c2x+!dJEj*Rvy?+N8Yf`N
zNYrmWDJ;BCzSJV~2T<4wL$bCf8h;{O49#s}O|;}It<`A;W-PtNoVvq`N<=`d3xxOD
zBB7p!yCuOYDUn6<+QKr!%h4b+zyAS8k-(cr2*RoLV&$Y*;n@21Z=_4RVMhiUoo`EC
zYETC1u$;T$;EqU~pAwawNSD$F7=oKmp8-VL8~wlmnm#0&o0~rNOh}oP&J3t~BCVwt
zxonQ(U5v*06ad{%r5Y;nf{<fJaByI&wv&6%Qpdca_LQFg#HtcGxl}RYqm`DYwsh&<
zvC_rX=2#RX8<uf>DllJ3D?5>FZnamLRT)EA(GJvcha9?Bi(zV!ji}ai{u5E-%L!#=
zQ9}+*<=H5FYOGyURxfXXp_kGFkmQ~7cl_p|gJ-mYgc&+mUsVElYCl^t%=#&|qQZ5w
zh+1ndwI1>48w)(Q*7G;FZh(;P?zj&{7C{##c3bn<VYG5%IrI)VVi>O;gsm>Y#uFTM
z>5q_E^Oq3&1JSf>6u{o%;IaH?1o45qb@pBb2z$tH%~N##3YlT$LgzAbi6WA_@{4mb
zy>o<j17=bI(6+Zx;9?5`MfF#0x%t5`qPV30R2;*Z*>ap=A!iU$F1Wsr{!&l+1@V!$
zJhaH{j`=<N+oGT7;ZMCY?}A*tNDP%*keRy)2?+g6dMGU*39W$Imn1aLq{q^Swm@SO
zm^$V+<QvK9-^Z41pcd}k^7Udk=U9cDI!s0QzJi$69MK7GjQ$<!cr7ksX+cb&>WgF3
z6*7IoPwt5wS|YQ8<oj@&2ToS<(zc;up$lA6Ml`r`h0SaVYysVLXZ5}U_i(_&3=D{C
z|GCZJs*%1(;acQ;V)trmILaXJBzeA+o?sqZey-1@143C_`@oEnYnb=Ux)Cy8{|NS<
zS~^@nE<fQvSA@vxNg_}OLKjr(8n3*;0GyGK9sE#l8L*Q~Pnh8v!HzYG_aCz$QjEA|
z34*kl^UsWF7}FFE*~^#M;3%JW)<@78k*$?^k5K05V+s0!qtDHVK~caN3C>c12yF@L
zE^NN8j^!r`uMS?;a6azBM=eN^$1zBxX1T#@LD*_go@fo#sFBS(2P>2`^8FPz4Atl+
z7eA<iQ4h=srlK7y8KUiI6?4|SCRUTtem1DxjCqphHq*G@v%NLi2iq%bb@C^h=bVSp
zz_0a8e?~$W^y=kkl7cKdNYU>jGqojRvz0!U9$ANSF9|vRCGlR@_>%~1m%zEp5dtxY
z!{bIfd&JI6m#^7i0a|pWB+py21{M1mO+8nBE^)_mcg!egEM#hU$Z#FELMUMnYa8I*
z4Qs09WFv|PVSA#+0+)YI%|6HdJ|Vd6S1!YPFDzKis0`+GQ$(S1%p>lwm0L(I2UGXX
z64o6Fa`cz?aKCe`EB0b7Js_-^TYk}n_lt;j$C}Of#WPcbltMol672_{D3F)r(t#YL
zI(AH%cruCxTdV6ufR+<4AM8B9Am;U-3P&x&Pg)jrPwd-bz^04v1x369!qhWsV-NGb
zkse9!kH#^zY)mlt_b5*o^A?vDq?DUagf$_XJ3Cq0;r;wMM4F5@Ojz!k2_iMb;sa)=
zvA(pi4+#22z;>~a4TFrv&EQc$h(b+mZ@Z(ppfMTkCf7g6YZWHG8k$QD&!o;ggUnIE
zioHlOvjNhNNG`IRn~N(VgYbdz+Xs`P%)0X1V-qsBOyMq5>2ikY1#qz<PMe1h>V_UK
zKkBxPqDAVM+E}|x&+C{Q#?jeQc>p>y63xx07elWaCVPBpB;G5N*08~S78T^i1HoRn
zgl;&`ZD)0m{wJfx+8K{GgF-MvJa!hGP6*J#I<0eBFhn%wc|eWiX0E`O$m$&u)=M5_
zLGpCPN){nNK~>lxaa&>+Pkff<VJCc0WyOWoDo`mtya%+hBa|CJC?x{m8<SZmQW5y=
zndG(gM+n~O%+$DG!#bXPZ#`lY*1t3sP-kt76;a~>2Qsb%1O!$a7@Ph&S+aW&8}rc+
z5@DP&6kyK!)!3a21g0@D8GmvM19s9=NIiR`qaeljC{0jsHyA3A9`A}VVoOV?7hoZo
zOI}blWL57Af{D5H&qYtlfXD618-Y&zyCb?Y5`w+S_1O9c5pZpR)(#@zDIR{nflcwv
z%j{4KNh)Y7D;_X2Bd5{xw-|5%cnG6X8F#)gux`KuNQwJv#63~ae`aHr?ie+yU%wJx
z5fE6gmp8UMCq^*akDfn+NAUs^^H#J2cE*cJ?|(6(!AA`;7Vn&@+>8g_V|mjWpN&HI
zB;+qh=fu+)e2>g3;0tKPQ!JnknV8u$hz7ll)VwwI9+1ImSjSsa_qjRLibs7iiUX3k
z++pcHk|f=yv2-M)nY-`N9+b?)TvGn%81PXHX-WeQsFx-?vu{y*UK59usAN7`wy3sm
zjM20tVeM?Jg%bn@4_{eaHOHPe#-ErQ$!`F`$%s`B{dN{Xv>1(pX?y?tvLQ3#OF5F1
zY8$n&AIIyTnSZT~IZ_}R5ei+{O7~9@5qP>F^zIZcL_xf?w@%P(bpL_^!W^TW@pDmM
zn@uutJD|pA(lf%x7cB8eh~PPM7dH$I>2>f_x{s9)+D+J#4QcfpT&)ivqNY|@`7{c`
zHDVud==Ed`<3Nmgjcz(8U`IRHmJFmAUA)F`&B=Xd0Q(8>${`?XlmX@xa0<&V=Gc-j
znk}<{?i{nwIt=(Fk#5dI9SD8qR;rq4Cvz1lQMhZ%dgj=exoOIP#k50^8Z!htFo7t5
zW<2tE3!;3!voUrR#LU1aTBs=FZk7x^t$5**7%T<gCxmn-zJv$qsq}&Ly7YonkYGQ<
zV~YHu8>yW}4j@NGmTYVbbQ+;#hqm+%z_q_I9)5VOmGvE-TCaA$Z!4p#2{B&SNUD|_
zOhYK*vKlwAH{Mo@Ze<{Ocf1arVN;^*qe6f>La7oLAL-bjk!{PqA7FglG5Y+$x<E9U
z!3ebAjA_}0aX`iU2h26DL(6dY3PTYFSm;g8lb#yQp>G_Yq3+&Sv#=0F&*7mv8+k`S
zvj<TuY&O`za)n;{M_Za%3LW`YO+Dl=`d%b>l^NgzQBPr&#cmY-#iz9Zrlh10Imw0t
zb<zes<rM}t@D7@@fDsd(=!ta2#V5qaH=n|+e}IPpsvHa>+R@26fub~kJRg$#WONZF
z!q1$wykiaHH=mk9&6*72snjwtZY90Il@&O9U&M}`k4i`&?vaXhyyJ<HgvX!Al%r!j
zt_&{Ih8gWoM*Z|FhG-U6?60Kzn2n{`o5G{Ni2I_`=3072?Y6>uE^WAtj-Nl==p8mW
z$Xg91y%J`}8P>jcL#1J6Go<}M7<h%zA8Sfv4fm|KZ$s!7mTQeH2OKmk0*4p}|Dv>a
zoelll25&yt#+KMGsB1Heg9?j1W}_lx!C!yCj^(KFwZ+>@qa#c}C_$9FCRo^+PtAF#
z%G$*T5Ks_a_C^*DNI0}I8q9j@B{NiuVJIFINQbwl0rU?2II%1~AChsSvc9k?QI9`F
z+ti+cAzNxcGc)YlVJOotX>R5JfDsF8E-#NKnYj%K9(b@wu5qBJ7|AFSjbA=Rfajz<
zGXl>!r?G?}T$0!Kb|Wb3VV|xT-xw9_%5t@4QhrWQ&Lu0nCm^p8gxo6oV;h{?Q-qb<
zLOm?3J-k=r4u0WNll)MMf)^}E?Wd!M2!A^SX=SaM&oFOS)G5kQVK2P@84@;wE1g1$
zmxg@q^$3OPEbfacL(LFSFNR!0v6z5B&xRz57;6v>H5E1CgS9L4%5#d{ZKY>4A)MO2
zY>fxft6*dHU<MBH6sTop@Oo|Rw(+N-h42hEj4<gJ(i7{VbfQK7@=M~gExv9~2i85p
z(-|er0fnn0I9>5pso$vJZ!4;L8GA960fY_jtSh#0LxWadFr=&u7D|KZP2zG1R=lum
zb7ARh>2F{1W{;$Mgc&<P_bqYs-N_zcmw^NEW;C)C*4bT}0|`QJZ$PgpQx@=iyqUN*
z3tb%JU%52`wJ6%c;L^%ediwbb0tB}#(cQA-K?=Nl?KWg@V+teckM58YR$+sy9kbzE
zi`~|ktDxQnhc>^>0mLI*;qhly#G8{s%ebA(5fHGGK9E+@tGMB7`eI5rH|MZ83b$Sj
z)PlP<te~;&aOz;B;NwqD1h}*gf&#x$;gZ+9-O`mOQl`kdT3CezG9Kt$J?dnxr6<x)
z>BS4Z^=u%xAVIYpLvR&gYN&oBixmRe;*SSrAUyVtly3o~zZ@bOxfL)5(Zj~h?8^aP
zE~IPep>)@%AjueRkG-d?5?4`!Ji{dKiF=li6K350<`qJJBGwBUXGS||l$r((Ad<+~
zq!U9<8-kH7PA5Mml63|lWkg>~igs%Y!-Rnvuo=7}A3U=hi?+lE<OtYUesfca0!HKB
zM&D@&{v5G@I^wl!D>4U2u(Q^+kk$*A;Q$HJ!+m7Tm%?&_j67gWLrd=a0f2*~8IKc0
zjxe&b$qfkR!U5*NogpKnwzb`R7TcNYE1{}va1je~!3Jh;KAHp;(tEHX_L@DMQxGoD
z(`qt;h%(CF#8^f_Ng%C*2LlDpZFa`mh$N>rilXGr$5i0C0nn+^eFu+mfI}F~E7LwK
zJbO*Tdt#$hOEZb3X=|T>)p(o=Sq~cM1{P6Ns4Bm03LR4V_yX!0>egbRWWGU@uMmZX
z@WCbj9(E%l=s_8%rf_;~B$FL6<%tc7$Vb*c0w8x(>Dio=wlJ_|nh2MZES*CcKS=lR
zinSFty@=Wypia7w9^>3j?PTSaoazoC&B^Ey8tmiTP>3DEA6Vx8c#4?+#I^LuQof4&
z)TR_s=R5aaQZEtZ#vlmJ(i!69E!Dl|MuB?Ga-2#ccRR412Mb-NZE#0JL1E5(#)4<K
z^zxXGnat}6e{pJQ{h8(AfkO;7F9Oxf;?c-SIwR!FM+JmI4IqqR!>R^#s{;H9Bx$gQ
z0Uj-}drKOh*D%<lrLM5F;AR6-ympVrO<xKJvZX(MZZxp*1~e>g!y5+xSK@ktO+3*}
z5I!WJyS3FJGd$#C*z|v6lUCMpP+7>8+m07D2V?yF$@D&wekR?8xZN2SV1{`KH#l%3
zm30CK?I0r++F3K}id5|21)PknN|{D1a_||i4OFTp6*y%28iep1P%frvM7Lrj082r%
zz7bv6!07yIQNVy;c39$PJj=n;pTIPC%n)pCFLI!f8cTLPIZUt_+Ikr&t%C!2Vo_${
zG{n%sov}Q3R@vJC!w08*u>Td+_10P@Dj0b|+AMB6PeQg@d)&?3u!#8I(hN)g@->yA
zyU6y!S}qEhyrtDg2l&Mb%y^IGm=krEh)~Y7fbKc17zN?RmQ+lvXb{<EBx;2R5HV{Y
z%<7E&Us&ilox6t2hGNVyQhRF$tFQ&#u?mFzw?xmmm6bDuue3z7^yhc>{RtzeA7Fep
zOa&+m6dr#vn_LtLc54Q@IezleuMSJvp+I{aN*IG&x{Il9LV*)d^&-bHJzy~|QJ0!&
z3MIaw$BzQEO=YqL9(-o`zpNvvoDAP&LD0gI-zXo&h8H6sBl5=-N(KjQLCEhsy^?ZY
z+TkV}c3UiX+m<>`XiS5IPVZi_Z$GnRHEI)M(ULt;fz8atr2kxkQ|+<sg^@DOy#g#C
zlmsE2O80p9jAE5AENeSoVq)=QsWX$7sA*;B=AZ~AXis-J5<--`gfgCBJ+>CE4Ghq!
z^iX=y=9yhuNucEs4%XK*QAFLdVYy*0IgFZ$#XlUYQCtwU+tw-$YcP+_YT~{8cPLqo
zG1w6_$>v!2GLH>ty5?i)1L+JioEsL^(gjsoJkY;FO$@dqFN?stW;-kR;8)JW$go%?
z)<@Xg4dv7L_h%Fkr}ej@Rd8dua}*W>+RRyzJ`!mxwFvPn?av$9`-584+k?OkmJn`A
z-0cj>&v8c6>3_s9;h9V04>m~Q9!YPHjd6+n2)7oLX1Dk=8yL{}=I;m;f>pmHp6IMa
zU^fbYdI%_@ra4=`A`Uws;SryABQwyU#?G<|TN^&TCNk*b$d2BcryK_*JMv_K2aKp^
zy8IT%*20SSxhdAcK)TXG++KmUG~k_(%`C9KCBJ@#85m#O8fYlE6iI=vsYW)m5pO16
zp@Aw?vnXqj&>)~iiF)8hLdzT8bA&wQ7R~N~xvDTgQ5TN2X-&#!W=0!pBONP%OLAqi
z2ZH)K<ELAXAj5{-m0pnU;%H{(x}qH|HA-9#_2foo=f*CW0hLvpJ+mamIG+rkHADfu
zZsbPyfa>jB2RoZ+e#d56b#w!CoN<3jX4yYNW=Uj$kS$Tyl-cdAIU>J}#Z+U~!VS$V
z2#(ss+&ppr$+7Y~OK^ngI*^bTP(?4DiCP$JG&4F}ZN=tG(3LX-Xa@QlxOV}OE_#5g
zp{X8w-=KZT%BQ(S<1NzNAyiwBUUL)a1KmKm`S6<Pb2l<xBNfOoRW19xx3;So-_o#0
zeFj-s%HEMXC7}pMmvfsi)(ZXI*r3IoExWJ?r>5Nl<?S87TKX7}p=A>F!shCR8SSu&
zeg3kRJ{lg_l^>al$TZg;LA2D|BYD4O?)rk)T>uB|ZKmji0j;fe9m!kJ01G4}vV`1=
z*n0q%)zB(zSkDHIO4Pw_zLDC70-p~GR0L56tXILT)rtIl4fXDXaWECXE?dGa8{R8m
zau**;A4BnL?8xH?pBl4U5pLex;R@j%g*IL_N{P?2$pVI~f_1P3tBeJb8yqkn1+hWN
zil&@}nU`pgn_=Duk5JnoR~f1^voNH#A*id#J_g^#Y<T4rSWYAnnT6Ol@^dY{E4>Q=
zI0FAC(gS7>1O+X35<yGTag?0rCg+8<eg|kDyEynn9W!io$)5ysDdU^rO#00$!G{B0
zY>8T1!$n&=(xHpZn*l*?X&I<^ii{*KRd(;P!DBhwK<<2iY#-2{uzoc^w{tE|2mAH0
z^r}s-i!yQGUptFr`^`H*eAo_~5LyF&VlY8I3E_$utVX=HnDGVHsbd#gD?@Z)f6^<Q
zNM}?Z@6lU)uzKR0oAvu2o4L2__G;Ksn-PzXyC}<e`U~kt(mzQr(Joflv&n43l9reL
z)t^b<m%c5XG6C@(OB~g-Ipc};qqpBlA4+dYuULUAs4j8mjI#-zPR=pxaPXOR<whe!
zW&qyYOiEuzE$QVE&Zq?U@6gQ%h_-GjqXo|Etr?N-T~mOW^mwue+;S$%6EiLgem#<q
z=#m()kYe-x5vo*eO(s#tY^~wC7l5Qa??^wAUXo6UDTErxs7VX*>WK(0Y(;R0*X9F)
z2oo`toJg{x1?weZ#LNP>+!$%~Xz}AG0|uUhTs5Q|o*Yb|x3LOuOFxuemtH2>OafQK
z1~@kqLcac2(mT?7(ibr)dz_|cJtEq96w_!!dA>^pyRdk5hf6A~k}|RWq2gB*YfsGX
z@9l7q8S66k!QwszKVK6SoLPk;3V6K;?Kv(nfZUZaYJyuC=5ZRrx|H6Oe!{WNtwCVX
zM^fS#*Iq797ynTD2kD;lvaMp^*THihTi>*KZ%RMpJx>w7$en8-tSz9}L=u*PbMLK^
zH3yDeNN0Ru4bkG@EW}_=uIR(quzw5I_=FXy-EuiGE609DFW}s`kd39^!y<6X0DdI>
zH|bRRb>{v@`Q?Jj+>+h9j+WH8_`gc6^f~Eus9z<D8AR7emU|xSPI^cBOB&~2mClLD
zYBnGllc&;$(xYP<S!TdYjl`T=Cud?q&u~xONR5KU)No|YOH_7ehOqB2Y<)L=X8nz4
z*1Q%Z=rbgIV*|@4B8Y?Zf%K2kR~X+Hyw%vfiY$EZZ1%MNKau`YI+b2$lQwYJW|j&?
zxZF$MMtWQN2kC({mmZ?1CuVnBcH|N3-yT2zHIa73&tIYgbE{HrkbZd?$X7MDB@`|t
z0f`q(nQda{qk?>jmkta#K)Ia2AXI;hB5b6eNPi*Sm3{-vE|G_p=7Tcq>619rE9GTx
zOMfrDES*y#y90#!3=ZoOe)>TB*V2!r|C>>ety$uP2<3e3AbkwMXKa9ST;GxlA{#!y
zY7V3)a|b5A9e%6D*USm!7f9QQ^bjm-T*FHXerMP~yCfb6p7HD$f{Io6@6tm0Jm*92
zhe>FdF3)CfHt!ATFQgOcH>8)4{{k((qNgNB$G5)zKal>5)Jk8M9^hr}k&Ok7Q_$ol
z`WP}2!~?n4ITk6(Lt7BZ-Kdg~w&VnheTMvIfKQjOYU8*1^k>KHkmWGh*_(D0kpDK;
zUt$g*S;>#3zXmsbReD)^*+}3U^n5S<SbD>mptbZ^`nDua0KN+&<Ush8gyv)EE$J=j
z0R5f+fpdx^-KR6)5>!8}`v=n7(ub2g;?mj(4>tUFZ=rn9Z^MuG0_dqy=z4I6J6zRa
z1i)ut>UW7<3!Cg1k^Tm!w>~zVu?kP6?{Id_7jZom<HfG{<|FAX>8B(o{SyAI)JR{G
zzG5PIjxwj`|48~L>7CKVkEQQ`Q2nZOk7|BqR~FJU>F=cP0agtZxMh)dmMI)q>fA;^
zdC=aofR^~DE%vOoI_ex0D;i@57tg41o=`yy*$yf|XM+K=V+UMIA4z}1?1@j?IB+lD
zCTsJq^qzDTOIS-EOMgeO_Blkjv$sfg{{!hw={+huz4H76HLy=}KzPc-D&Vw@^nvuP
z$@Nd4)c<o2+Q^Q?yR`X7TLOat6Fbu9NZ$^)=z@r8Yon=7-G&DNvPD~W5Sh++mJ4&n
z^J51f0-(RbKBcCvHN?;;a&1Ys-e#gvFT&r3v)3B^poBqi;L(06{S4yI&HAT1r&st@
z>qB}#^eAwhkE9P=d0hTAWGK#fiv5tE(E|AmVwjbg+WF9#)O=yfVh?Dzk=o7}6;zN%
zbQZ&H?J(GZ+3eq8C;d$NE9p{tO?pjwMY@A+ifYV-^d0H@qh;7jA4=cF47`ok3ED@(
zwjWD>C%xg8Fz)=fQY*b7y)50OATwu6KbGE={!x0{PjiV+aCuJI)u=~^IA)LExf~6p
z4v2eYW#&Ej#m>;bi6T^Pamm3PY(zaR!Qu*%)<h99t-`y~pGj-!E7Cda!!tM@2}XP(
zeM|ZggXwNA-b?S%clcO(0quXlMs(6M=^g1!>HDLLlUnSh$I=JVXAsl~pVCG=lm3Ry
z8%IDhAg}<i_e8@p{ARQ^1;TuDaRqN}Z^sHX$aZaQxO3y&4z?g_x1qf?Kw4!kAvWdK
z_5Z)6Yw0V}>(a{l-W#6oLV8pBvD=Wi_%EbK(&sq^D@cn1r2VP%UFnCu{z-Y>m;O?!
zrO(rawKM)k@;mQHe=q%$n?B!yoaitrX=gU)ti_Y;u#nm4;_!?uMlVBJbJ*;=<8$dF
zM2ZfG=!!3xRP!-|R<^eVK=FnNKlwm<h3-Ics_d3-n72l&kk<cw>93_$dJD892T!jd
zQ6#KlAj0Y5_oV+Jt)+LR7YP*Z!$NEcfc{bX(PS7GQd|(-XHb9+HbWp;r)EUt2?|kR
z8WLss8uHLJK6vF+{_3c*6=leVWK}?zYhGD#=ZQVIzyqHkKs&?Gb{qnI7+t)@wf@9l
z*a97{$bue8ACCs=C5d}Wx{&^!c8HF9mOR>}^hA0#sl&lsDF{E;1RJ6B89(&Q#W*~0
z(7fl^is(GsT3I%L#ew#uH4XRQTKQ_tvz!|%Qj@Q6E&W(}BK<AU`3wWP#<@?gfAaC~
zOKa&3J3w-Y>4{|V3I{m2cq6?hT}p4!htSwzo_q4jkEF-P>+dy`70`8#6qVS}%;-d8
zGHGEWP$Jv!5G}901v&gCg%4?NaC&<@acEL!DBeT?$))g3>6!Fh8VL_5S%AD-Y6BA@
z{I>K&`X@LBIjS#mnrG5u%2;E~dsDiQ{?_`RWJzRgHL%0+!xvU#?65wiDMbK@#V84i
zB&x7?IZ!JLMoI9YccbIPBF%}~`waQHq!@S4#wcD*X7IiWm(sVO)2#Vv*(A7YtCkJy
zPs$6vZETmVtw>2!`S@6Q!t(uqT2I3+M%=QIo=A^)sfkZbSR7e*X~r|EC!?0XoA1r=
z&n2p!u%LrYL*4R5eFj|cce$Z`Gn4#{q3{|_lPyOt5ImEQjhZpivj`<76leob9A&cX
z=wHS>a95zM^uaMnV|eAQ5X#O9j6&QG+<Z$+E5e#y0p^ssN(*9ERw=7Fs<1I<*5}W)
zv0Q^RyTSkkiM}3DpeC1aP2dx}ub_Tuh*3>kyd{p$hFTt$G%}cp_1_V%#rpfrpLb_j
z`RPEnRb^Gn;0Tqtq3cmgXl}_`#OoP<G9$RvsE8PExVK2HBVN74?g&q`yls|ItU@Yi
zemEpCxE8dofRpS0#GcF1rVh<%Z@cqGdVdO`>X7kU<v%iDQM!x8?`X@3&VWn%E6Ipi
z0e;4|Z|Q!HPTu(2K7FnTphT5RoN~Km#6?10qmKDJADkkYXp88E_l#YbHX%DU&sQ#X
zvHQf$dptB=nSJ$E<0T@#Yg}*ezUMei+4)j~I&275L;u^%ipa6R1-!of@i?UnYn}5V
zEurs@vi1sRu=W71JbshzHWmMQKF>1||AH6Cl~F#-?KFhk!p|k8h5?FDn2A{ey^Wf+
z11DV#p}w1Vsk32NBHqYpPRQZ-WKKf9?eohop`K$R{$(#tuX=&$FYHvhE$GfQ6zhQk
zbf%>(Qo2K852PZwF-rQa>8yid!^&?h{R#e5YyDc;^WWuv)B69jl&1!&%uxMQ^jg_W
zjD}kK{x|~*r<20~$gFfz;@b;wy~W6lRyKq20MPx|AYhHgPyCClg8p|ZPm<#3r|u{`
z&xQo-`Dhw(1du`ZjJiuO8ar#r-b@|~yvP0>K<iIPR!_-hFGmDgO*(*orJsOcItx>3
zGNJ~YZAO&XyYdcoxuSM`cSu+3xHvgZ?{t%hy;dNuwGA%{0QF3|2R8l7OW6NoKbgI4
zO9jxOnubIx!zB*tSS>8OXatP3`HE;}W|ZQEchc^GByyCd#UC%t+(aYK;osyIL>+!-
zNl<vdnr!$9)#%PGaf4e3{!wmGO<<l2npqcay$;OG2?#D#q8@P^UE0MN^u+cPeZarQ
zPhlBs05&@^<GJOr!Hw=CkwAFQklDh}n94X6(d^f=fVyFVQ(Mqf8M2XCe)n(A0R-JC
z2WM$1BNPC0CH&+TlinjuHRRMY4lpQ;b$<ZeT3Z&~m!BG&P7p=Nt<7Uf0MpCTC3Et!
ze`7ijG{OZ8^d)fR61lIz;J2e=O=XRE3)mJR=yQUbH6hO4@S?%~7hoBgQBrg3i1~!X
z>tFRJBA!d)kyBv!wH;Bsgooa*e@Auu5)8S8yd|5cm03OM+F|6qk#m<69xs^9m#Usy
zGPq`v2KsN{6TtX0=nomNTZZKc2ycMD1uL#2xZJUwYeq7p1-pWiFfd*+u|JTeIRQ0z
zvwNc2J^=bR;s9hL2ZSahdS*-C>;b7P$+QNYkeTg7gAU}F_W}>GBv$IzFnHwwE*2C3
zf{B&mRW5F+aQv(P)WkbyM2*iR*-Xix#O*OSgHm7)JlB#e<dzZdZ6@UP=s=j#2qztp
zR%Fzrl{aKL-M_U@5~&??G=c=+=Atx`Xyyhes@u_L*RYT=XrKU0j)^5V7Y78f1G5l8
zaA?>PT=0x@)&5PXKrBHLuaB9CcE&pA24{9wZMmuA=GN_bfXN)?_nl2e&<H<`WsI_H
zY0Z~=cq9$pqa1}!w>p5y7`40Q-^W+p_J`f`uYAs({j%xnJ8RKxF(Y10+29pAi*Nc*
zPAO|fm~#P*_1q2%idxa&DS{r+4egFB2rZ4p@Cgt^X8-$(t%6^6+x~s>`akFT=Utrs
z?u78aqWvQp(ies$)T1OIg8E##D+#M~z!{%WM;wsYGZ5>R>cfo5Kj9@>L}PC4V9$Gz
z=dAzFwTjt4^WtA-^OAi?6qHk@$W)N%!`xJMC|4d=rj<{y%@S#kk%}{XKs1>TI#CPI
zh7O%8hX?#+R{LB|&NKYPB{+WVUvmGtpI`ou|ML?r&ZQGh49&^9VvtV?dP>%Hz@sT>
zT#A7XG5xNl3vFq$dIybQLc)V|-Wbpy9%sRwnor08qDD7aENks<6N`}%ibv;z+mp_a
ztXnS*SJ>H4M2H<Z>UdiZHZiLu%nS<3g6^yW{NXSP;ze%S5zW+$Mp_x4@@jI;TB6$)
z(E5~U>dd%#t(P92BSw{>d`{$vDrs{Yk{uTx7u$lu&^%|HY&Gok@9cZobJkyER3Ud2
zr*`CQO*ARSPG^2`6u2sSv4U;YP?EXk(3vre+>zj&+S=AIvN8j$v!BNRl*0m$??TA`
zv<T-W|51#{O^V~<_O?6c*7cALH=@366oiN)3m(2TmgK?aXC+?A+%f13U)|Ad7~y;J
zp)2yU=<pphlNJOwS3G&`1JlU><3+Z&84nlXs22zK!lk&r?I*%`d0T^~E^cqT*fme~
zFWvKZUg*|}exHnRB^W3NaGk+GQh&Ub?Fjx)X0~+3?)1tN)wdj*wX@dwjLf(p`W+7e
z85^^d!oOfgbIHEm(8vEs`tZ0ND0#paXlcyMq=xofOFyGsV0`gPdP#c4ng~XI>{|Lj
z`r!D^GB>xfA*4bc3yO$4y0Lqy>cF7GtTq(8?%CL#pg0GqqEDf>b{3Y03416MCE?f7
zD`=B4cqYBZQ0OoN9qgDim+3-!Z*(S4y7*jrMS6IAY&y{Gnt>YE#}}VSuShR5FGHA)
zhF3@=-S<abOd*M-9V#!Beha-Z2=P}`<_A6?vylhUIv&)+n9x_i_S;7Vf6vP-jRdd*
z5^c<+ratB6Z397tp?i@Q$c(gZW`RYbYQ8@y|L4+6mi%8q!&~AH(h|OV+{=~IM{!CL
zH8V3>1KmH8K04k!fB18frN%&shGX6uer`bFnMwDMoSKte<~WncX*((b^U=-)!necF
z-8UFIm;sK0On(x2EH#3j8lx+WR^mW@IB9<h*8k3MYH!3%$%K3~DZ)wqUzQ$Nl_LB}
z!_Fu9zdnA2mIk9e6sp>Wl7}L7jHMZX2pNBI1<5eOY@OTDcD3aMo!>dA*9V(D9AgA`
z6b4Rhmi^8C4J=V6ol0MpzA1fCdeySPBra)U-Bam%(qBn$(YN0V@E4`ulD@<+xFm_!
zEJGtbk^WBlGwDsgIEmHgq(6|p!Bnh>RKsPbR`|b?-Whd)g!L(SixTdw4@BHcPo<9t
zbb2tpwI-<;=Nn}I!os(F#6y?G1wvM3BZLk#MFj_JF%_TRyv0iTg7l}-S81$W*l3qW
z(ql=|Po)1U{SfA3zy4p6{#g1lefZ%6YW((N=?&?>OFtg1e;PmW$I{oO`*xDvQyXje
zk@P>Lzmneei~SJ_<BVjOp7bj-4&6Nm1|HO4at=FLBbZAgwFm+3tR!%w15t!uSq677
z6s{(5Ng0HWhmpneucXgPza@P``ZUzJj<u{A@tH*UjgkDHNS~K}SNfXtlGGw<FG=@M
z-D~Mv(*Ngp^UkENNZ*iNmhQ7cXG{l5cI%P!6X~y{AN$4W6YReS5OEqCtaz8#Zmbus
z4<a)<x$h8Vq%`>q{O1za5s2IXLX9LMN}fxzEdiV6<Yd!ju(S%lD*e9nIq4x}(xr5l
z=;2y=Q~Dd}e@O2f+n-;N{y_Q+5#L=JF@+U)B7H~t|4Bde!>Ln__-W~Pr7ue-Hpr*I
zfL}<@q;E@qJ~Cb*g?oCHSCpYr87$fJXXt-Jpx*a#<#cCU4w<M8n%A*y$$Rv(uskox
z`K=gyvcc>w4S5<M5T&odQu?g)C(?hEKE*hb_|^0d?@O1`J<hSd!L#Gye=q&1^cpI5
zPE%RQK^5210!!3iypTR6{jv1_lD=$*{vBv)NZR+VR4@<ja2$SRoOiG!UdfUB%;v|$
zkh!rLPbTsO!u*m|4a_A2Q5sj_XRzJ_Gda+?v9`Tlj8t!=Lrdw?((g%sBE3rBdDo7F
zdxYz{C%q&6ELf$<`g~UU9qB(wpCxu!S$Seji1M=ZSo&zR{;x~FFa2lf^F(doHxJ<M
zKb0O!Z_~*=4nkw8a`vns8+(8=tC)5*;E{Li)(SRx4Y%`>oGv<*u15m}W*i@TYA!lD
zqQv6wk0&6E*8fEMob>zBA4^}7ULt(TAgvyV!_Tary_f&bNWU%piS+BzeWRzIQKU!)
zY$m-Sy%(F8*8g+Te~|u^lLRZ^gmX#Kh4fQtZCtVepE&ZMXqgD)Byfr7Nb>H^oBxY`
znaV7AX+IPs>(a}Yt~`Yu*;Ik$MwiBc8+5l-NYW~NM*2hP-%FojuJ)yLVn>^8q*w5*
zPe$wiIqBa^|A+J{=HXnbrAPFsr`ptu(i7>!I36_J`8Dacq~DW1E1d&k)Ogi`Q;yH1
z$8Nkv!k?HT=Tad(kshOhE$wtmpnyRJRublS*!KcD-Hu{TM`7~dQk>Z=vO8c)Gju8L
z6V?3d(KOxO0aVf}((g(CLHeroMOD0B`lTdyFG`=0{!x0ImZ6eflzv<KW9e(soZ&&4
z9Yl0OD)CwA^U^=~#V<?0C;eyX*Q7hhQ#{@bR5%4>|0sRmVv>OlToP~3FkQ8EsJ2vT
z*372sFK)SE#bz|{Nh_kB%udN2G^5V#G>m8km;;1ut*de8b38{f@KE}7>35{hGu14G
zVk@(o=}8|*uSloTll0&->2>LMrElW8D^i(_6;4m2S~{0rkWOO#<Ica2_l*%wCHLF`
z#JncGD*c$?W?%yjh)R;OyF`#DKy#gqGwDkWUeidLkUITq(1x(&8O62JNNzG*OpGiM
zHcYsS4e{ZO1)IQ6X3}fYA4-2BeHEu0fX5O8m_DydpO(&J{geFvJLz|%F95+M0~9?w
z0hP!4KXv*4ZRroCuSgvSb8K-PXH158EFHM`(QzYJ*k|u|s4a=Ad60H=CewlOS_P7m
z8RIjT?tub^pBP{V#qg}w7C2%C@Am*1^|sogO6djZ>(XyZuNwFqniJK`J4`D3ek|ck
zdR6*e=?|nYODi&%Qv**MqM}!&&q#NqkJHv9%lC)Ue_{&O9X!r~&0CSaZly0s-wEZ3
z#gAE7Ep+iUfmzC39)eL8qeBrh{^$&xwY39qBi~vAi(HSqM~D1I;+Grv-VhjHSc||>
z5je#-UzYw@`ZdmF+Tpdrh867Ld~~GZTzXCVHR*G9sBzR(S`1_K=Js7ck@o!u>1(it
z!x($bT++Gpvh>2R!KYt_26J7~Rviq%o%B>{q;u&G{a*bi%4c(^>=~q~nFanW=<5K$
zj_S56N3;Sgt*zj~diDmNWN2QOz9xN!*nP%N?@`PXekBB}U&60Qzb}0SDXFZ(aKl2Z
zp@nC$dFkRWNWUR{0UmdU51jKZ9e(1A(hpf69lu3p*=oFGL_izqW30mUXq-%Iai}a<
z4ZxsfnO<OUa*yAomt0`|W>6)fo>SY%=<L`51O)a6($}SXfT?q~vx8R95z^cZBs{>+
z-;_R$-JFrNT~ZUxINu|YF_(Vvm!;n%w7rl%wkTo4T<M&!QpeFl5DX$&FL;H$HEUgB
zq}Dc(yQhI(^QeK}9XMQgPHmt;sH>3!UXtL|lsPw~yaBL=lgf|RKc!#)PWmjPP`C6k
zM3+sC0re>@H%s^x>6_B)m|Gd4A=#1*(d)H=13hx|r_ygr3n;RIBCc$bKr<?z`}0>&
zTwCcWqwSX#y>-Sv%)E{r8CYd;>&%J)d$N>*@F=Q+H^To&dL%t0yBP66YU5F?N9P%o
zz6_t0zJRkYhNH6*`6bnt8bh^v?Rimp5zElg(kA?PfGRbwR0jZ-?)({}j5k=%v`brp
zxR?Vl9Yo}IAWi6C0}XS7O1F%ZJB)U3fe$Gyg^nIPpQSQb!{~51n4!9so=TPUkZR%v
z;`>AaAeZh)UzUDd`V=M4+JfVSwK?UZ;a#cr_<89I;8!J)LIV~xqoXPx5;Sh^{1xe|
z0CrJzog=paCRRlA)31<YPZrd~3Z65!7K=TZ&EXhGI9M5>H1m?NqZyXy5ZV*N@m5rZ
z&Ww6GH?5u>PcU#d_-T-b!Vbb(5Hd$k{+NXooA+zd%fsPE8%~8>+F-(!wf5YQUcVr{
zF6Gv?wL~s!?7;&4n;;Q{-9Mm?TO(}c!$~#8=sXEwqOYiti}x%6LN@&2%mCV6y*(oc
zx@W`tS8S(l8^n{=|FhDkF?tK)iBlVO7CC)Ts|FW;mb^YVdnK__65izG1{iv}Lt<T&
z7BaUsOfV;K-rLfQKNY2qnelOFD9?-r$1DEi%E$RMyj5Y~YlHF1Z3%OmZ+qi_JB9_{
zw_}|nA#d>8DI0oxYX(r^vI-;j2J(?Zr<swL4m|J))?c`}6Hc&g%ze)6e9vBVqlb6T
zmvUmn1s?bsH+02SgHMG25k4NUUf0G1Oi{`cO4$Pgl-ufs^q*i%I{=;_Xyl^^Xl_Rs
zMHVg_WBxPdaE?0=VhLXb*omrH#B?<!tJ)%<J_B1x52c6FearT*022cpX`zS@C>%3V
z4q2P}61&%6(qhip^fZeM<J=H<t&LLIB0T5hxMQknWKcO%tXortm{K63$!$1*C;|uZ
zIt78-;cT>p_1}zbP=<Xi4GYPJUahe+&YW3)<7MtkFG?>E<7fQnLo=G`ZBN||#pc}y
zq02aMFQZ(wVF^VMeWH`L1a?>x1D)cAqpSK3%YNcXc;K|Byqk~Ex?^Rm7_l-|;S7*x
zZWA~T(#JG|-(ke&ha=M$^1q--?WvRzG&zP{A=j03M|ucIezVSZU>5{i$#dr!QCV$K
zNpJZ|=6s(buo14tWQ_*9FlOOsh)t&VydZ<^q$lLmd&`XL5fUGdG$%e-6HEm+HKNi%
zB`ey#4y3trTzWI)wsym`$Q*!%Kifhos9C@{6o$PC=0JNMGI~C|bI3|*^yS)WQa3sf
z?DF8P<@kVL-?i{B9#3-M;^8+O&KuBii%Mr^x?>p?k5FT7oU1lBMoVr$)?f`YeDoa0
zy|>wur-WSBHil~=f(XE-#>$o0^OU*0Cw0;T&i&0i1Iv-}8OAL`wr50R7qIcWV;=MY
z)3LBKHLelB^e&I3XVOzvYWn%(S26+x&%%XQnfdALv3Wu4X@Rf~)azUBF$YoWUp}R8
z_u+ih7XB4j4{^6phVd&zjK9aFFR%@P8RbNV<uMAF+w{3DDMv;Tch69l_IUk+#@Dfh
zb7~j|hM%{*g%j1s&%cAJl3Vw6IV{L_tnL7VF3C%Bwm0XoD>J>p<UF_!?J$&4FAd`Q
zIlrESdLw-ZjeD%Z1+QI!hl@w3S<906`jo>@`y?PE01VF@@#GSX(iF@8W}Wv4k_bH_
zv+rybaw`msKPAXdVSJBkg1$R8EV{5EeK!wNm<lZDTn+3bV?b6hTKu5>EV(QOp2coA
zJa%V`)jzXh4Ff8Xo#GR&I3sf}Cz0$DjW5g?9}va@rM+en)t~}<u;^aa$yu@nOCq~$
zl0}o*!7&F^bYu1NE&scOlycYOsKWjU%5H>ztwEw&ilez#Fn0zC#2zki7ipdDNOK-x
ztiT<Z%~NS7omsFJ@zsa8&Qo@(bp_nR5)1GwlFB*E`U(_mz`=~zC2}M&si9=?TWsik
zN54k~sD{irH!jJ3*qW1meLLg{_AJ}lP7W)Xd~%BL?dVWlj2e-aRy2vafxDiqfzcE4
z6E-ZzV@Fjeq~OF-k%>>o`X{Hcwyd#(ym5wJT#ka^o%D?2RA*fi1s8W56Xt6<_|yck
zdsH~t$F&Wl{1{8NXE_Enm=L7W{P~jM9tCIoJsst-(H!heQU?IX8HB+F;Oq6I0qv*+
z+_w`aTV$!?{-<CHV+!A%?vNQSuBOnD5%k?5)}M`5?qGS5?DUWsIK~_@W_4Ti(;C^L
z1X^l`O|5~&?XeC?k$V#u%$=Eq$Z55bCs`i1z-4IL+?-p6@a)N&@*zF@=2QARq$Kxa
zQsvtC2dy=Q<W`@!xn1Z=#lF>;^qH~3A|c2}FVNZu5}6LiKT8bbc61)y=o8%lp|_RA
ziy467%b3xL>q8hjb99;6`LXNkOaRkTElBvT$-fVnzCj<iFF`hZ<`tK9{K-8@c4)-7
zh?)%8C7nkJs=Bs#r80WheB=O5==>|}i9D6axGY$iV+CjkQ%@}3yfQZuMcfJ+UB8Mg
z|J1Dc%rKi7HNn962KZ5J-Fg`@K%k@@4zz;?TTccM1%TrU1<nn3&JDvq5KIhGxlUS3
zkF5c^Adg)jxvj;@^|AG>D9}EE>U9P^Ik*0b;yJ><w0OJajcPkY<%DNhdl{t}re8It
z<BIZmi2)8CK^r@_HA-G<yRfrO-2x}Q2vj|Ig&-mc!r!&ky@a8xhqvr~Xs|{CybTB^
zYG7d$+{5%L5UEv!aT+?B67#;8v_BrYbk@?lGMAJj@_pig{?l??XhO>lc)5nUO-|x7
zI}QMHAiO2;!@}OVK#My5eUJh54xnMm)0L4|P5|I*;IW0FYK1>TLC6qXu^Je>MWs8k
z_(2L^n#P=9U&>)Odc`TVtv|gB(4MrrGI*pmkh>$NsE9@eI@p<VHFm5;YXy)4=4I{H
zS0Gd$OR^w3<?jV7jua3+v!kuX22?hur7-%s*wu3m{fv2By#gHYPZ{aUiFHG?M{71_
zF%+gECE}hTfXy(nn@)jqgU4FxZ3paeN%-!+(i@=t5PS$+D+dBvP?cF@qzCI4ESNoM
z>IrFkMg;tXXnNWK%&=SGv0CcAPSoCF2?~#aS^!hwWoHP_0_jdWx*Ej*4e?_1PwhaF
z&ft@l^dwv!k5UyG@C($CO5^NBCGM`V0eTim>|8)7b5pXFA>kR+lm|fVgOytv1EeFE
z%b-Udh)PaJgTUPi2)7jV&eA%e0|Lp8mB$@GgrS*{*#q1tj#q90V>x<{_UM0r$^m3`
z6hJ3TP-~VWLXXNq(hM{Cbo3xCdoBYAcCg5VKe%uFoZMd_Hg=05Ig#sNRxBTRkJ1fb
z3iI#UvCS2Ru|)=g5_cHAzyXcFLzvoNdS@2<90($M16mp+5kW`>@{mMg$=5V#fWn3}
zmOK_F%yXs_my;1xR4{jZ^)deFoCi6WSKE%fM?;5CqOop_v$I2JGK)_pBIp`yDYL@U
zmg_IM;Q^?+^^04ZdXRF-g$ZippZihyQvBrFK%&rr^j^=vSmwqnnxL_|8!r)$5=m|h
z9&atj?l+<^MVV2$NomdlmbNnQnYr~1r`@r6o%8}D(Q-2XXU{p4HZ#w<uyxJN1__ni
znQglH>JH&q8BZ$OMRR^M8-C&D?`B|o(fYkHQrwCN@KXAiuzdW<Ekr1g<r$K9kRC`6
zpi&IdmsrV&5`sw^Kv0KaieGHqfCg{5=GoS;*+be_Bt+ZEBI+D?qoIZu-aFvTl7Ek~
z;oucYqOXp4XvbSb1MS>qGTdA_W7#s}pC!W39ljGm-{$z?T?>N?*kOXIq0TR*2c%-X
z6=;uQx(#c&#r_s(-o~e~#tMZNM%>z1XH<#P9(#|{e`7}LVMJ5Gii%QfXisc)?>%yI
z&bdY73x)_r)Vu+kyP;4N(y8>&Mp)$3Wa3SN&T?v*-t-=blp_Z`Xt`i7J@fMEXbtz$
zQ|TU0-f)CfVHx3!QpF(o@9^|Xn;El&^CEMjd&^@6_b-ULpAxmm1{^O<Fm}i7908yQ
z^|9dM8S0U6=7|bu>`j1tJ9-B)Y)cT5C%}*NDeyimcPQ!p4)i~R(Xtr9xVg1YWE4xc
z(npY=9$2A%$Ng{8hnkooxSnTZ9R+`w656}cfy`(k1_<uLNW6keR2wJVb)k<~7A!=`
zj|L1IK&nPDKmp1<BfhBYhYIYGtRyi3UdAgd2xj~bBbz!{Yti6JXIq7Mcl=3ZW1bGj
z13@zonkXPdp)5f#_v!GiOijn18Ti@4i1UJi7QDV^(tFZ}K?KVTVGaUi<n&Q4TiKxB
z*(idJly8p>UrSGctvWO82Tnei=Aj4^-5Rdj3?(X?L{R3q$V)p?^{yRM-ja%JeWFr?
z|0N`VibdYB3p3mQY&rvoH7GFA4dLsCx^iK8T{n6LS++Ol7IR)StkR55lW`4T28~Nd
z*@jJ964Ra;LOb!Pi7vK88YTRj;3X9V0o}+FwpRJ?FkLMgyQA+i)p4I$jr~ST-1i1<
z*`a!S)F3KwhvNun0e$_7sIR4bQ`pjM*tVr7(y<>wUMZkGgfTi0R*q?qk+8SsMnt4>
zf(>eo=Q`nlTLPruf~>IBQJk#^oR_YHnMF#CQAk?b)YiBVPo2jdH;)2RlEUaBTd@%*
z(krOMMD&oG>F;cX(kd=^g-BEeMfeWDB{IbZT{-|EMeQ&1QGFR`)tOXCSJG2E!!ODZ
z6%2qow@tDOt=zNWfmyYLK|!&<8yrg63#ZjSr&t{^X;d{1<h}#^E8v;f?FIU?XOzRa
zl`*PM%z!&wO1P_Q-ci`#C_D7>BkTJwSmA>a^lHpO=bE!eo?`uy3At<Y7#nu9ImUlO
zesc3w8Q+fQik$iy9qWBa<WU6zpJ+p8rVCsAPUv;Nc~P;sgwPSCqp0{*lnw{V(;L&u
z61F9KC4;_XAmPA?L<MI@98loC!Z1YyaW%qVqkI`8pB;}I0Di&h=cB;)*oKuuHO_&f
zw$!dVwx{>N;oxN`#DbTsj1hI0YX-aLuv&1<O7p{eE30?M;aB`hX;D*fxWbtT>D!HH
zl%n4HvGlq@|DmN(dbxun1+nhtX_vq@CHAO>yy4DlWpUKl1~u$r@XjKF6$`urOBh!Y
zya?X1Jvq$!GtAY2y*P|+p2{s3z$*=;+X87@v*z`vdNLBkD`szmKi?xTb2C7j$t`Sc
zMB(0YA~DCKCBz#<(1Bb8=`?0n2zxXmf?iscdgA66%%TbY(SfOCCFtzJW-;|<u*FNB
zf}FM(f|+fqcO(szf5Z5SoPuOPl|iMrvi9nU=RXWrwE?r;W5zSy;(?EH>5gKB9agL~
z+E;YOJR2Qo6OR`pnV9`ikvQ*JoyvRgN1x1s&VYj=v)J>b+Yt`XnN(JW`6UkRCC@(x
z(0Deg<Hi<5_we4Pon<^rOC#Fa1JFJLm`mrw%riRCE(srkg3|iIZn1`uJjN66t-Hr1
zVW52tfxAp57uYail4cetbdyWJq{cL}9?F8+>jzMX4#(#i1S%9whb<~mQ_Pt0pYjww
z!jFKyGxW2xw8Hy8_QwXOLrb^P9V;cxu;;aPWO+K*KmUgIwmA=#S;;ba<$E0KvGSd-
z`0p*DNP>HlGCjnk9)?b8Y(0arpBbevAgBb>d?4+l7cdxuX5LGJ$SdhCRE?D-RFR*Q
zBLf)NOz=M<>uWK<J4W-aZ;Rks?k~cSfbAj$?~LtOdma@9s9<e(9C3UHOEL#bY)8FN
ztz}>{NII8Tk3=nbNGelIuD>keXd!Ea%tSyMc5?4wSZ1PjX2aytpIpP`TOr_wQ4ALa
z8GnQTkLSqJ!CIT9S$w%|Y)0W-_Adb~-S-Se2GQ5Xc6t>=bHh8v*LoIKHgxn0Ya`#I
zxf^RI?ItrAS*<a2E6Bh>%NW?)(Guj=OdJsEfrv3`_xbTS`V4}_nK4E)h$q3HTNx2>
zAU3@j%UGNY8OjI?1D?C0$!%Q9%&p_`0Pm|ZX6+TybFljj_PR5kO^F}eGLa~~_i|K0
zi=UV=?=RZ;E`h|(7^|7_@0VW3&FohS?sLIZ^rX-CP~bhfG0_DiN;VO7=djvJ!_f0d
z0~$d<TCk0|&cdp`k=G9}%!(mLvA;Qr5oEkmWb51-zWbG1@p?0UbS1rHaKH{TzMEtL
z6+@zA<ZF9#8VynEdK3X|q^DLl>d?N3$r_A&F{F9jJnBNaFFl|KHTWMnRZo%1?QY8;
zQ?m6ydPzC~|47~=MubE$BiamaE)L5gtn|`w=Rl{TjMgKuk-U4MAd=<W1SUyaGZ{bx
z`hQotXC}6_9<vI2_Jm<WgX<l+?63{Nq@1w@PiY1pTwF6}C>}BtIllUc^)Dwgn26zm
zWn2u|B#n8mSooX&%UE+!y;|{HH3>o3_K45Bc-tr=l{U=(fGs(*V%T<cB&o|!13(h2
zft_vG!ouL4PZN!D*?n@aHNB)W>m%A>5u6IqW2<wrt_V@RjarZp#+IjN0Ue}A(z`s*
z0}=wEOc9eeY)`+3xyfq5DrGFj7M$&1wq-WDzM@*+VJi!Yc!3U$(d*-Fs8R8#cSKq3
zB_3yOc-27SgR8YCPdwqLD_~fU5^pYU*qKw?naI2{Yg^h5OA=!RoTC8gub)cSp#L#i
zWW|7yK^$00_oUZtc4v-Djt%qrWH*W+ilK-dz(NX(gZAk2bOts<TJ}8NDQRbiV9$+h
z-21T3XmAx!$%ZeTS!esDb=vd_P!ee`OzCr^pdvXvA^e=4Wz-^=OQRvX>F%slF|J_+
zFi=yhoSBdpruqj9f^S}aOSrK^bC$MkGk&fxPdt&nY&jq(_%k8bt(_h)Do+hQ2aH>3
zIqk~2p!No&^f5rkti>}MieB4lH5j8nPA9_u)?RpKwbR(b#;^>px%J*}VI5H^@w!6!
zmk`-z(#2^1qrsrDWT7?{UKo<!-S+$u!)<IH+X8cPLH}9OuFFvpTH1+^7Yyu~!(I;-
zW{AcBzpg1At$4`VXcn<x8BT8Cy?O_*<*~$G?+l)dKa3o;|K#*9c(E_ad;y>b7O(`H
z?O7Z#wj#%0H%3IMFf0k`y*w_WcBWrzTt|3<;L^;9fSyi{BgqVB2@dIDnEVB)rjOtf
z@9;|clyrivjJAl%3XjdO)~M0_=Q!)Y%)$UwM$_3_hhrA1yEn@G7VRs!e`ng)j7AaE
zG^cKjWU@!ac4o|Wqw?+O6BMpf3*VO5wz>2&PxqF$v*Z??HE7?K%k~7sIg)lh>P5>j
z8c9w9PK<=^1^%-&4(5ac?>UpBu-6vS8jhv&m6uiUf&x|(G>L1%P6c6iaOddlTSI+4
zwMhqsJ>XM{h~tOfF{$Hv*t`@WS%X(Tm+rBIeG)71lgCodHv-5F2<(JfWgos4EdCkb
zymwQi|2&Zv(uMT1Vb|GU2{u+Gh@Wn819SGjwpNgv=MRF#g3C%v-xju()#O0WC{53J
z)tQaIxgKVH*`)kuR5`9daa)_|5uT)jq&@D0&ymm#`+rIdlH@v^Sw331t@N?<n6Xop
z)zG478l=K81+jMkNj@G*4@u#!;a=x_Co_uirR8G18dhXmfe?j0oKeJ_8x{QK8wYDK
z7HSY}bW0eLEsk=mz?p4JGIXgXSlXGq8G6<Lm{p!dg_gH;05pWJxe@EC5r!ERm@Pjn
zvW~!og1OU-X6gE;!Q5?0d~!RLvqCCo9EtbTG4unbOYnzgymlj95Ifw#9}e)BJse6A
zCY-TsYdfTFW`W~`0=N%a7<pyJT$DL6Ns3h$$4tC}HP48qlbk$*6mb_6atuBen48e@
z+~{%XJ{Q>5X&!g-J5Qu1v_HIzCp!?(&hezPkqrswniHmPQHRcm3pHGht^0sTpBt2{
zL(Zb9?O@;89UD*)2R@=5v9eWIvT_l6O-utRc<nV!Bo%+SM}KPQ2jk7FU@n~jwQY$W
zGc@Xy9`Ma^^c87R&<-<}dQIJGY3elKQDkUUZiTAWsD34OGDxZe{IIl<uN6GGiUNWU
zQHY<LAVG92l`vr&9%>G^eq}yv(2QIYH*P_JU*sE|S+~L%eZBld&??^96*kF5RCv!u
z_H)n&FsF!m&Gh_*)pWx@T}qFwHR<M0FD$?<EZ7M%HYcFC8cAOyPh0YhYwS@tih^8d
zx|1-tZmC2OMmN~RGsH{GHyn?XDe>Sl*e)J82m7iz4*%xh6j9@@QGf{c)AsBQ5}(K-
zBi=}h7vz`Da@^d;g!DGNv9^<HXdj#c;cYUtdyn9~)izB-7D}{ME1Ra!1Z6<xF(cjz
zdS^|d7X2Ch#S44knVkgD5gf082?t$Fx$$Q@LmPqxRa%I-ne+jfRr6<jBl<zt&?A>v
zsCERwwT90gSkZzAF8T;|)}Jt_;4P599b+*Q1>~7Arz*mzX&S4G@Pb%jVK$_)K7#02
z8I;mD-0HrKE!r52UhqR>{VjR07XNt4R~vS8gZEocicrbPUkiJxJ^6kVmjj@^^k0b@
z@3oa7qoy3-K-5%w{5R+?_pG!Wh05U3$N!GgPhSPuq?e!qTr+iM&uezl1zffs1ld^K
z^ez=k=Vo6b)a^)w9UF`Q++#42*1EC{9e^wgjF*-HNJ&DjIR<hp13@2;YDhbbx1BmT
zw*<Mreh+Uiqw6^wWGqA2jfqdxyP^bs$B>hSH8JF5kpmzpAfOG}6K-{90}W1%>)V9_
z&rQ}Et7E2;#(fgups<W1=)lkq43CZQD;O0~T#9mcPee-jca;BHqiUr%_<;pfhY<u6
z<*O5OPKk27plV*>$!;A5k)t|0wm0_A1DC|(e~cL&+u-0#WSFtFg+HwwWztswTTDT4
zfm)>haO7(znlM6*PN3N@4gYdhaMTKS$3Q?3VZ>L1^jjHoOUI*RD=QZ7t(jP4syi&i
z?zRjj{mLCXePWN73fftE=HQqNh;TKD*Ika+oS~6hWF#&wkAaBj6xpG_6)`|s{A&)=
ze2<eHHTeoFORZWgUiyixv5I4XZ$w~+LkPlFJ8WLt@LPdpjfFR2<gJa4Y{4R1vY9BL
zP1v8#u*`!+;I$#Y2kEi&jyJN2?QJ0N&aI&%t;H6Ke>o}whvcOlzGt(zOM~NsgS{J-
z@dgl3@S_Sur?sB`V4<Wq=&r}%+3*!TNv*6b+VQq~-@Pc7hQ$lLjjb0j$0H^lP6VMD
zb7EV9C=m)&M$6t45cEMnN%(nUG_;l<odH|rMvdy%FxU<a8J-L_YXCo@#5vd0r-X!E
zSTE7RSb0H2C@na+dDNMWVvXK`E#Y6wuUwd8DE-a_tFp8S92;II%y_Kg1cf6b%)3HL
z19!|UU&$sT(0HpFchj(V*VM#bq{DZHhIgU;R|X735iG;dMEQPm+(+-PkP|`QmG0w)
zOOxO9a}loh8CZzG!G5d8tM6>GLw#&L0z+T1HW~TuN_zPJ=k7nhExC>>Uv#F~_PBEJ
zmH@qxNVX(OqTKD<ef!J%kKcH=&Fd?*q$WjD^n?cp0)?{2ZI|YLSmzfxGk2Z>joOHN
zV30u7&YhXDVuhJ8BNhvh3Sj3(gc>nGOOEnx`6tz*j!^T+hRy~+O#0H(MjTiZ^h~&6
zZt>u7@x?w1B@v*;hK2lC`cOLG7ePz-zP6M(2#iS`r?zv3Dp=bjwk`W7=#t4jOn^Xh
z*Z&rrEokf&t5};SiKeo^??!5nlXZyF004jhNkl<ZLBQPrB^+7>e}++=6KZuA2NQly
z7vs5E&YYR7fixET4tP%8pkP?Y5n4Dmi{4dVWAk^95fE>RhIZ&(V}Cfq8}@WC_CQ>-
z!%EMQVQy_gm0PcP`UybY)_VKWpB%H**M1tvK!H*UbenC|635P1%o1Qv9tQ*yuQsmM
zmZxj6XA#`BgWX>dZ*2iETN{A3Ha=dBs*VJq!9uwKA=zzhZYgk>m!1!*Ei}n-BbNq)
zPwimA4#v#L9}~Ingi!35&2Yhj9}rUw!#|gT42wzsIkU>Bu&OtFo|vr7V9vfx-%j4+
z7G=q~fzcg>YXiw*#ij}6iF>Yy=6kWzK>lHIB4vpm+Va0q-|w_As<RKJXRLxQl<$&?
z%?U5qO9I1t=61J|e~;dqoaEqSbed)$T(Z-kPpx{98@r_%4D<+YO8`l0rpO%G1eOZ^
zPvdvq8sM7Q@tm`MA4_8l;}Q99VV+fJWTJ5|a92lIV&O1*TG)Uy7S>aAVetA4a}wq#
z3(K+MtpackA2PvgOpK{C5(5QuuC_Mh3{#n-jtzdc*T>LU7hI@kOVKoFX33de?jNU@
zb1wYMK#*YKMnajic;f+3&jD<$Kr>~#t=XXw(_NZ*>@MEf>1Qog{*s#F(h|gapaZ!T
zr7BD0HvDc4vwVgzA7K%sMyR{e%MfCBR5BL4@i9?WLo{^tZV#n5r4OZBBsgJf3(U<i
zWx4gBf_*97lOECDGdC|;SQZydy7g!Tnq%1y`prj|xR)t`@)j&=7CIc>C-5gB{~Lq?
zA^F@itP8e+?G#fh78FivG`Sf7s4j(b>6g;S(lc~SpYxv1;oTZQ(+(Ez6X{i}<7VK8
zhuFgn_)ixBrBwZ{^h+R+S)akap(kWRa(FZvLAM5G7j_PEqy;VGq?Z<7ckds)o(cIo
zX2(t7=SM;`lwU)O=A2!%K?Nq}NrIa?{_rd5SJK<kGpyeu>8{a8r$!L#5t$n4OnOgx
zM>=HWWNt%wqN=jNCCfvPGgkf<rn8Zbu}4d8m{8bXj-JZdS^(OR4V>F3ln_K_B!r#`
zC<v{mIE~5*1r;^C4eH#xc}=(1goNO+bST|s*$RUD!~Ib<0v>-~dRuzA-;mR??6pla
zpLzP*a;)!1(tQeWVaXGzD{_||uDkI&-vg)GGKFkLRZ@lnG&Xjj4*|)dwzZj&h6rLs
zW8)g8Kr;|PGC-HyVn;L}*z`N1?$uzJePLQY2SQl`CTB!iORF9XgSaGyH>K}Lhs=ST
zT6fvbD*YP(jw&ubm3}7ulXNV-VikkRa>js=&YX$YN@vo~r0+<D^enfX7-yuh<ZE?Z
z5iBfukcDf4+nNOm25;#N=^3{2l5tUiKG$FgnW@eQkzrz|Up6e>Ib)APT{m=b4O8GG
z!k<Xb5#29%?TYYq$4f;o(ke&<Y4QHMbW?f`@FRzJ9MSxi|6Tjyol9>@-{q^%uvAmq
zbP0vI#qVlg{)H8u&nU<xGoI6X(c;D<*1TGnxpbSU6=#-jmgY<|=<@dm13hcgiW!`r
z$mv>a+ZK~>zOO$?gl|hf<Z#;sI+yp$BRTrg^)e>T+&@cCNrm)z={5!=Gwwvix$ax<
zWKMT}L;8EEmR^QEx-u+jgA|XjWJ4G}2kknvlJ%TjyoJ5Gur1W<=qv1`<&qfm60z4b
zS~PnYRkyJoQ$b$CsEd%lvMZykK6C)*(p%C`q%Qz71mUVB>%U;96$AMv{r?Bt<sC`b
z$qSx#$%vNLeBjm2`-Sv9>5lXqY@-?5VP*;&O4+@9#{x>R&keTj#yT1T^O$f=<ylA+
zmLr#n72g?UINhq(h8G)POZpIwt04Id&@GI9uMIao8|gjize)v1t1qk_B2vS1>8|vi
z^gea<tBc>2{vSw2FJOH(cEhy|$axrZ01`p%zJ=4`z9s!9=~DU@{EZy()Ht(=bS`}$
zy*CPjR+e^_INfNWSy|#VL1#J)H#2zaA)Yh`{XR7dm(wrQL=k&JCvZi8SaFA>zy~bK
z<~jg6iSPsIA2_+{B`d<M;h0<i^Svkik`noqR=h9$t#l&&nRJ_dy92X|eOF0$rFW#q
z5l5%R`#ZijBLXX|p+ZK~zVCN-8#HHv!Uh0e7~{s;$u0)gCsjRfS_&TY^(gkXREXv*
zN^c8HjN3N1e&&kOQAXS{-iVi^@GI$iK%h@cx1n?=2<ntgwUyqJJ`4xkvLt^`bUU?%
zl1EY{-H}@9k<>^ZOCR~ctV#a=1)04@n!3SyuZ_)6j`p%;9P2yi0N4_1Mt9@}j$l2w
zURb4&-vmVS!suWVs72e*|DfR(@Q0?BZ%hoA%Z=jFj>`X^q*ovkW>(F;B*IR6;XaUS
zr~e;H-;-`I{Je&_bA(GQh&Jy^_x<9v^p5nOrG@mT(vqO@gb*=y`xz&)jX^-3LPZ@o
zxM^#Mje#evqLv@kr3Uox2F0e5ICPD=v_^aD9Kgm<rJ#Z@p)}5IOO2p^=@UU{hSH^I
zE~>6uK#{xB_oa7Shf@&rBkblE(1gmM|MYWrrSC~^0}5PSd?x+E3S`e&DKEOd3+Yt)
zsq_vLo_e34AqJ(fBIH;bsyQP9=wZ0w0OrJ4A*Tr3P5|P170jq07r3<m%u3q6=1>wJ
zq17WWdSbl2CH<{5k-j9|G;)4g)Q32h??`XB8%z<@&!|$oX0)jm<Rjr~_oN?4Z@LJR
z{NDgA+OUTLzX^EuL+M@V4SeA6`DJi#G!=@py)wr5;r=|ZtdDR7)oN`pv`8zuE~eI)
zcQc}1*&?sdtJr+=(Eun(;pfssx{y96y+Y~ijBqWnBpykBD}9&ZdME!kENZ`n#s8I5
zNH0o~ekGksccr%m`u|w^xn)W>rIsT#>VDDT4e3|=4me7&Yx2cpQBFBxDrX>Q1x(tj
zqeg?p1-qz%N)n9|hoHOLfdQWoUC-?dhDehq)(}%tXBlC^CHenU`oE<{dY-i8(7L)S
zUh{|259v6$y7)qRSNg7W#5}^S<=&5_dr~WXB>h18Imvlv`F_ggIpY3zY`#lE9zT}8
zBfaJF@0sB(GOVG4OmIID*)Od#x)(%F$xx5kA3NI#kEJyvr|CWgBxet`*3+KZjEf9e
zm7~@_)Zl^iLqM5?Jk9%{N+J=b2xvRF_`dXA>5=pj9y}x8smUwvOYcc<NB9)NJCoj)
zD(M|MVrJ&dCalxj(ueyIP@ru=e%Vs|Pv5)6CW(rEubLU$-vc|>dV&MaftDr)Ui4}(
zr4j#wYd7Vq0h{ecV;`47a-$d0_Xy}5F+BsqNEukKl%Kk=z9*eZFTjIttTLH^`4s-Y
zI}l;I_(#%8`XQW`=ne|lD5ZbD7|j4`V=WYR=)%Hwc#d3_0~_4pWlk;AIplXUGJ`Y9
zBU#iCT2RF`h<S^bE+EeY48McOHNFHPg~!s*q^GE!tbpy((%zTekUkhlp^_d+Kb1Dp
z&y5YSfxK2r56C9o87y8(Gk-2^r8lM9R`}116}^%^lzuVTJRyaZftS%ac1jQI+88y*
zKF%r%kiup~NEyo85)xdpuX=fGX~D7xqgF^L5}L>zn(O3$U;2)83J@1jSx(khOOK>?
zrMF12I}tvTek@%||4gB3&PxV!@xJuF^xppBJ(RxBK!^!F4Rf%IHQ>PegUwU=0-Q>Z
zm_)uqV`kPRy#=RS3=T$%B6l<nMC=<hlZsr#4NjMs)ufOGhfi&ZjE=a+){1E3dmz2#
zN_y8OQ$&u}TC8!o@5i^&U82dT;%D~QA|@LN0&3cJ-nVw)=sgIQ^u_3D60)M=8s^i2
zY%DFo8S#0u&*ThlP?SF_lkN;z?kx82Do|UdZaib7C`9<WCX~5J4XAyB8%RDlp~6v(
zjR%bK(0b4eyD$IHt<1uM9khaxi!13KU}|X@eeii}-gIQ~TD~De3zoRlmQ5ecHuZoD
zlUdhGuyv9wXGU*caOm%_4`WM=9%e9#$j+Ht0ae#e_5myDedJ#l;FR|<^brw00MYcF
zlUphkapJiC|6KaOyhqTxTWrkb=qQVfSfd7y7yak~B^(gLOzao0q>!WM(d+E~`H)~J
zD(fRf;4O69%9<Fq^w2M*W9;8Z`0pzGF+G=sQM*D5M{1D!D*Tq!zB8oTpktQ<B2W74
zq>mZ<T@rT{#+hsSGYhUhodXCO-`uD+8@@L+T5N8^uR1f5@X<RqaxZY}3G;)Zni>B{
zq+l}(O>csvM;siwG3;U*-2;~d-kF&ZU+v}zPW*x}A+BugUz+PK4${_&9TQ<*+`Jjs
z4c`DIAjc`Jy2wS7uRP#S);NP68Lt6ujE?03Tf4Ch82s*@0jii2Q`tQH=$XnW!q4{`
zNcW`>HyIUvYdhgu`L8T%$ojwjB>C@*H@AOhzka*|AAHP;rq2l_krq&}AfWAOVOZ(V
z))4+70z1S8^~&!V`ai+4H_ZKy#4Ta2xh2=xbxROZSPeFrN#WNLVLaV+B7D-t`QBx|
ze%Bh%9ktiB4WL{|cZ^S&;f6aime~;p0sCeAi!kp2eugLO@Y4#y+sq1Wb80;^NSo)D
zd+W(>abY6p(WVA8@_`5|X6@ysj(+iz^f!WbzfxwuMs{9T1RijoBjkvg25Tg_*hdjs
z(vD}a9<@2!g!!D4rmeh!oZ;P<=DSJ=kn8?v+X)P(CoO=+L^QBLpK`ITcQH@8xV0^&
z{u+Gx^^pbdScqW1=7bU=0(itJ6~j_`&`Iu4Z;Kw6=t#O4H~`OogEyID_o9v)sObF3
zK0s*c=r=w^_}5*$ug}+B{Ny{dPq;GLmV)Q8L+xwpCzqWs)2-YYtpNu%KRRNKnxCsc
z;f7T59T{X`8-=yrw$wCl4^ET)P38aTOVF79{x*x(UT@+*W%CH&U<#EW{mFqTUm$Jk
z{XLrTrJy~fU}#M$HM6R~kj2;%ac-EzH$&Txjow;bcPPAn+`eDu37;g0QUCDi!q=Go
z$A6BRBSR4)?az=>OH3Kfv_a19p#bUsQlD7R-7+R)uCeMz)E8IMBk!V#0@2E<nJv_(
z2>(m|_SX_d`^5kJ^v`d6!T!a+!A}G*)sk$j(9$JA=p{a|u#q|4^*-w4kqv{I;z=rY
z&j^Mb@J3<tpC;6SGccPBA6`6p1N@7L@VEJ`-{!Vq!yij4>S$|R>=G7A1(`1(^_~bb
zo4=E>7J-GdWbRSE?_s!=%_53I*Ub77TYCA*U-KP*C;9*O*Z=mH6@>gt=!liYKn<Ne
z_t4Z1epvTAn=_ky6Rj8}_-Qs^VbvsP6H3WbOyI0v@1NNTi`l2X&i@K2KH<voKQH}1
zCWPV>oZSrT5fzxyvXJg4L&Y<`J%Por#-vY?aPCa1z@SB|Pk@fI{_u|Ilp6Ly|9*U&
zg;E>fwPx$5N=RXDpqGI~!tXS~CH$ictC9`Sk**774X_vCYi(ShwRM|Tzq@()-|Z7P
zfmiKvK%E>xX7B_j12d3t&SRLQ!f$U4`Dp<!V!f_HsnqK8JZR5`0@{u|x+UuvX#@Q|
z_yjgIl};#KM4@nJ=NX(c_ak!RD-&`?&tx=0M(cB9rv+RZp`rUst-bG<ZiEW@OK#9m
zIF{cN`F9SWG3cxSkKXdGmv9+cI~}Ua0Jh*>f&2vfDnil7g-ds<&VD@LCu<{M?yTn~
zvanB&K%e~ApT2t8Z+-D^e`Rzi1>G|vUTJJf$9j|pomd>HI)A%=Fy!10p@{y3ULShX
zzuOjpZ>$bpurz1;qd0yeLHv&tzJBYK&ul19V^br-Za1`Ll!JM0GW36LanFK!X<!CZ
zXpE&_U$ajRt>G&Mz#Vh(1(TMZM1uHTrSLBx|4;p)@t+7!IPXJ{6$&rx^umTN^zKtJ
zO4l9uoZrnMRXrT^(v@%?ryw0Me=Fg7x1?h%+HaQJ|I2TIf9cKBD*|6y1tOYWTQG+j
zkbY(L-R_rnR6tV@(*WT_6+5Q8<Z#mSc!uawTd>_C5v%N=jhejtNk!oBVexoD0m>}a
zXnviG_koCOi(qlFvs!&UT8pelK9T=V?KxW@punb=qs_b(hr%!59_JKyh3j4z(y(+c
zL}7*hnx(3#gO)sbkmR<X8i6{*?WM5H!hk1<Fa$Y({-0C?j-}2Gg82>ZT*3y=Np@l=
z(V9@aF?haHkjxl+wedV;<o+qvBPLsP&lZisF#~am^_ucmLGU_o0_3lw!kInlpa|TK
zUYzyUZxHOZzs|)OyUjO(@4S(VbE^Sn21;H~B~KQ6L2<s<%910}b0}03mT-cMbMu>1
z*Z;^D!tTr3CIdEYj8pL>B9IKD*so+|mRw)Ec!SN;@PINce$vYCUw`e&_lm#^ti)qH
za8!VE(6h#jQQ<*+nCnY>hK7JLeWKNIHi$rXh&aVX$?+cdPgsmz{qo7$pUMq~+^wLw
zeuElZK&BfZqfO|Cp3=Ni_bo|lb=2X-Q}V~sR&t6B-7;L^eBXd()EKAMI$ha%hW9Oq
z@`uHc2~*9F=-Z7Xs$u|J!O`fwn@84lVW+mF<q1TnvRSe96AW-hm=;eMjw<zztT2z`
zQkW6r&25TOaBdnKkTNd+15ny95t)$~RyL1*eB}?xms&!f9k(hh391HXa!hD^xkR@o
z+<a!u^nu)MoCGmL)wjlHj4iaIj-T7S<ZB_&47BiwO){~7dS_(haS@gbyqViX@}0Rp
zzj#E27tE4wjK!8iD!XLD+(-ZsG+H;2Tkx9$<6=pAjv%MIxugr)oiFX|><U05E#7Q!
zs9kHr_-p<mUOVUKoZDVKfol+o6#Czo56)p`oQ|Grn9)8Gg`SW_i!3*!KjZqJMrItC
z{<IVTE8BS;eWVLTqP!h;ufcSPcPLTv+*8=rUM?d;&Z!^^1zK7yz1uyJJFJbqo*R8S
z0_s6_HvXAN&)O)6f=!d+<HYBBApJ!8ne@;tLB`ztFG^pLZU8rKO;6HJeoy+D^vV9k
z&q-gCUNIXm0qv@one{X2XVU%aCK8Jk&#f1&f(ExRp$uWj3H@iXp~yMgXhH;*p+g%l
z!iQQH7{JQnh!QhW*{N^WTA!z-&v86vi>gc+a(7R9EPXgC!kP4}^jYa84s(bgGTILw
zOYcfQm+tu<$T<4(v(mFxu8L;Pobutj%#o-DPabM3Onme&RThIpqP+G9G@|AJIhPj1
zw+4q+pw*rJH@rz{Y`2ttY$+NP)}S`*Vj4){S?McG)ob8hh-Xd^-aFDSr27LDC6RAl
zlwOu@kfw(NN-FS3dRO|z{^Cufo6>92>)-`bZ0(Fwo<5e|kbWxN-Pivb%hLcpU($#-
zv$bq#)9Ovr%pp)G8+gH@BwEFx>GGWLuJ?%mw6Y!p@pD1HXzf6tYxVyr>5I~9R24Qh
z&?T*68XNE<>63jGzAb%LdW}Ai%mV0SRPIS{NN+NtW4Q8Pl3tgdwr&VnN$E`bK>CsN
zVcfZw0Tf^fE%CyNuIng-$5>9e4GO>_oM?{gNS{li(d`DOvFt#V&Mn!P;y`9rc6Vzv
zj%VGFzAF8Hq|ezfrnH8?l3MAObSeEeX-Cg$=L&C0UzYw#`ZMVnCep6$h?SHdeGjy;
zn?_JbH>Iyj|3P}4s4s%32DyGH-LfH1!#F&t)r9c1H1bVA8-)c|T_<EbMa>A{oaH=Y
zL}@LZ<5)&~Xe4nH!-6&}_!JL69i4*^BD^VmP5Nu;bz{zC5NXyllfFS}eeUvqTKXgD
z+tQb%6L?W+>#a!TPNg4{uJmBs+tSyiZ%Z$*nU8HO<3u`_K9PPP{TJycerK6tt;sPO
zJHoyt2y01ydniClf|Q}H!f&@kLK~cMw5arI0BamdZB?$|phix!VP00Lu}_l1H>Cd{
zeSwEMppEw-LDsqSLu*RBlEOrKM*6DsHR*F4iEw1icPT1+Ed7wt1wH-0C4Ei$OX+Ji
z!DV6-xl$YVcSwbY(@^q01e&uDbClZ6&di~JNL*&tE8N&)FZq)lYt&@`fe~zRk-<6K
zp}8&Gl^w$=u0;`_lKx2g8|kaq@I)<}@Iz@b!w=BzQy1Y4=_}G-Nq->S>37$*Ofh;_
zI!043eDR)=zA61X>B~$2xkUeh61<Uq3YY5Y&YdQCo|KctWj2|y#;r~VRQ#ypio0wH
z*28W@A{voRJxGCP?8znOHVVqYE_%{1tXw*hJ|}%k`XlK@8v>d_(dV#)X(-K4r1zql
zniTjCq(7DZNcy65i<sh!jB0Jfw4X?CM;$7c4y4zlZ%JP!K)b<h8*8x3q<hk{(z}CJ
z?iMf6pe3!1Fv^Y3RvHjdy8{jmVkCSS23R~rZgsS71C+K-t&u^u7QUpFza(=VvHnSc
zza;&s^vBF|TUc<NC`8XljdUpeTzWHfI4$0nrT;_vCNm|2E<A(Ky@0Iry!1xMKT&AD
zD*am~$!%<azE5|#lx{QNX4uRcd1+=O2BGQ=CShx$?J(pms#5g>^fi7XT8DFD+V1)&
z8idZvidM`1_{p`Ur9=Hcls+eYTlzEU1!8v*0iQ5{?3UC@A4nfYJ5^e|uS@@4`n>dl
z4O`f->?I@VCB4BYgDd?%mcA(cTj{T)XKeyRw85rbd_&qvKl6)23!}F*vc?0p@HwH&
z4J(iJI^m-?^#<(&!PbeH9$Tg&t_NLw8J0c4&XtXnE3u0KJI}65zmojFB>e~JTXfYI
zOn}X#13I~L=}SoAJVbaXy(ayc^lj-gHWMU(pq#yMF8xaS*p1hT^cCrEq(76M#&3kF
ztcVqpIe1n2Y23LNO0;$qNDdfYfLn=Bq#Y<=X2akrw#Y4V{vEh=O;QpVTo&%@NIJ3V
zR%77LmcvydDV&Z*KvU^O>CdD;m0lorX>A(8O$)_Kh{jisFqdAF{txL-n4y(JA*e9j
z6=D6e(hKaWI@L5@lKx2gx^ygkY>KiYtjXyAnG7nhgHL5izL%1s_Er%wp7}t-sE;g=
zc}t{n1D9H3>Svxl$V%%If}JH!@08~9#6&%})1Z5wB!$mNe<l5i^a6vJq8hpDLmMZ+
z-d7RaOnO23hV+Nhi*|5Lfr2bq`HaYZ78lQ@SEN6a{#bg^Ao3Kf9~i57DZL`SM7R17
z|2SmR*O00*Mrz59b!iM??yX!RK^(F}E*aj`a;rH!$C-PN0F-we0T+`5XO=kB*ro@g
zLFi0+5iNYyPS%~k=$jdwBI(6A{w~S?W$90(Z%WVdgpv5fE-rbxnPUeF=~d}l(l@0S
z>>T9Qn1h*Fm)pMcT>c5FT477lvR`6(0{M(q@h$`2f>Q-^Feg(zGiRQ`vFf4k!3}yI
zqUb$f_`(KqR)j7i>2K1)=cTVnUzTo?{vBf1Dg)zNCf0UZko5nLrGEo>EkGJ)z*N$S
z^s4lN@oO}bUXs3z<(r_S3#)xs6jnEk{>^(1pk)Tz#!O%IqgHlCSD>Q3@H5C!mq27E
z#-E5M2`1fv14I@4heO^w8Mh;2!#5~kJDPu`ApA4ZH>KCm>}aP+l8iczDED2t^>p#8
z(wC)IIo4v$%X=Zd#*BBMST5a^{y_S3=@r7<0+U>`Ldk&~P+uEj=n+xYyiRV}X^;nE
zIfB@h#rsF#*x2a@HA(e`f0;7t;Y$7yezqpFf{-cw;TaQ|0(lyd|B3WP=^N6Epjg2;
zjqzv&VM(u*A(M`z&q`mGo`-o<*&vx5i;*7ZY3Zr3At|(aMf#%jBA|5!+<w8bZ}H^1
zjztuex{PAmBRfkj<J^r11v}x#;x#q|;xTN9Xycs`%}1$0v43PST0=j0<YYB<o5B*6
zA%2pA@PC61XsFo4K*&fXlELd@uq6N2q|ezP?I00M0aBvBq({ZbN&LRTAnL{jfoEpD
zWJk;t*U7(T4n!101DDLOZI}Cp>NUj3!dgZ3Ad>J2(?Bo02pZYhLb}D7y3VC3xo!CH
z;UvJ&|1U{jm!4y+M}Q~`a?-i<G(lchM-J@b>(VP!ont!})DU;%eutDXuH?UzUX{Ma
z&a3f9QTtCyxx&x&S{S=fI#^me9DH<CWI*z3>~XXLTzzW$i=a=(B(X5^InvH!!#S)k
z%*83X-r5+o!Ya!nP2fxES?OuE{ti#u8k;?*R(0ky)D$c{%lu-IEf!#n8K`>8(bkol
z-XrN{>3JgU=Wu3oeyc^)sl)NcXe)UX5hG*{>lf6Bc!h=Aa9L7tYI77@%4$2T$I>i(
zkHR&<*0j9s4mCWqxyF)iO1CY(>^8s+=~X6BWM)sd=1EFpqDF*qCI9E7r)-Rv$P{;&
zj%ZJh?EGrw=P<SB1|Zi2%UkINo-l|IL;gi;gR&yiOXA(xNf5R5*u@K8*^s2alf8Go
zz&4fE;5Q^uL}anV(bcFx4oxL`0){PU!bLtSJ#V%5xvlfWPIai*NOj<EOHQ*X&?Q;3
z$b?E@`p;<3?`@uE2zJ9BPuVk0dTh*L_T(SfL13Xd3L7k}%v?&Z9kBrFU0RyFGqr6E
z2Hy<i-(bLx`o=wY1d&QL*RilN&cJw$S=blUdMY%pW#?AXL!aiG0?=oHjqWp3`KE=0
zk&Vo3I8JAqkMLWOsmw9@_d#7I6d-%)SLr{KFq`}Q-qf0c!U+bLzwvF<VwtDXEkv}n
zvSh>42SKIh0Sj)jHPfB*FI#Yl+`{_d0?dsA+2X4<=z2l;TM(jjp=`!k+R=n?CVj%L
zm>Z<t;>>zQXh+Wzk)pCT&WH&s6zImj4rMk4DdYOqyl48%%%^wPw|MW3^pJ(lrImDs
zdw3unN;k2MT^5|#n2tHsy$OF_+UZJXEa3=*l`}7NW+>i-I<|0zCBIl9?>v<6I8Zfv
z+swLZb3(aZA3`{Qn?`lo0Sc^0+YgZ3wG8ecT%AF-zis~T4D>cR_Va-cug%fd3=v4%
z=?se(vre{_*<F1qGxrH3Tftk7F3KeNv%z5+zLRJDOxybhWie&M+8v82%0LA%SY`or
zPS8@Zqk0f-%PK6fr#o|z4j6Cj(BI*wHfW(T{3zHATLK6%Yqhh11=wbW>nZROmu5>+
zRp^d%)347>Y*8+V9BYdT6jov$dH&20svM}<XV7Ilxx@3F6^R<iV+_`^Pk~HoUZWa4
z@0VeZrz}C#O;`BFW7t{OJAlX*FQ~dq*rik3)vX!B-U4n&cn&OQN+ENyO3C+~4o3Nf
z11MPZoBVl-bc46(qbNqCU$R8vRyqD4qQ;h=8`AhPW4v8ju-MuSw?;)CGI(2x3k7~S
z*n1HQ?FOcM@Dqgv=n0H`4Er;_u(4UPy+)X+DNwSnsa|+&{X;EtH`Z<m3+USi29XAZ
zDOubH{OGll_zGTrYWATdf}0y|oYAZ^^F5p!yq6i_NtOpStWo3mMM0;3LxKN+)JiAP
zi^jx^XPk~ga8ta*3O&zx+X(`iFzW4a(8`iBr6Uae4l^X1yEmf}7C>gVftMT{vM~ti
zv5}^y#B`U&N*yZC#8jxVj?Sp0cyd{~6oLag1wXBsi<9%V5jrg#!fnlt?Xc2elGH!J
zslG0N6A$)?-<l)KKn+^F=a$DD_Htz;I}x;$G(@zZ2`6~(?zX8iag6CqI(uRBvs&={
z=xG^o01cV;1Av&^!sy0^bybvyMyM=_MtXx&gMyRO?zN?4q5s~&GK1}RNuY4Y*lAlU
zLv$;DCBViCFQWDmeTM~S)YLqHm&QBmHR)?sEKJ%u38T0#JsON83k6QEwZcjrSa+06
z4aPqiZphu$(tLMITF`6oiu^Omtc(7!z8MKkAnC1(u_iz}!q-F{dxsgB8;m(@#&a7w
zv!Qz_h?w9~BC_4|_j6D6kEF-aCuWZqR81@3i~z<*gdbJEoL#cwhR4z^8y4OBwAS8o
zga}$=)XdSJHKw8${<P+LDxN4FEw`qii2a8YsvTUtmGq%83v$r0aKqVkdpHo^h9Ev(
zb%#<+4e#k?;m2?XB6KK#T{isO3|`RMkJFo4QM~5;8>lIT0UVb;Msa@$WP!#Mm)M#Q
znphCs?cWRsc7uqkL{A>Gat%OK%_Daghlb~v_!%NB%wlHb$0z$4WW)+dR1vRchULUg
z%LWdtHL)$N)>N5BhmkH&>fOK$hR!CZ5p~ZD21H@KbR)vAsWL?>8cf)o>2+x;*xUK7
z^oRs6LWkCLq2fUH2R5d^)1O=9MGaq?N_TJxLJ=+ojd<No-r+b7Es<Dbl^UB-5_o#|
z4zV-mBs`(?NBmrYK3ooF8AnRGwY{F1eVY=Bo({%94Rk1@@K@83G$m5Y*>@4dOa{*%
z1VCr6mh2(VeB~%aZ~!-G-vsbwhAg&d!nF);StwPZrKM5!CnnUnCHvhc$K2oKbq)YO
zQhd3i88#}5LkGB~QXR>ELriyw0o)B-zZ>w4bra;4&=pomSy`8V#{@3vUM(znm6?tl
zu6Aqgy_W)?<9!k&-H-vz;bMgk-|ai#a9|A=EC_sM$Jaz{Yiqg6mHspqD>STs@Q<dD
z<5!j<bowv!Z)RpZ#iUWY4GP(GU;o1=Rt!;DU{ALwY|U@gc1h2SEHPa>UUH2Ox(OGl
zA}#KskJu`OwM3tzIa`ZWDmLHvF31^VbzsN9#+6&*titG6-TvL6zf%I3DbY*8*4o*4
z%O)fjoWqjeD%mhW*PVclK7t87vVSx7UzB@fJ}4s|Xsp{e3ndROlYpHc8RI@Wd+PlF
zw&;%qm`Epvxz|+lE8em3^uNmhD!juj(yOTE?JR}avio|k9uA-+PfH4Lj7kS8xqxBV
zQ@|wL`w)Q^W@XkEQ?JP##<eHft}eMxl=CZUe`UY!+=l6GfJ(xiML-z7JM&E+p8_kf
zvuzQ)!v<bZ-q%wN-aj=-J%oCjC=eOZWTb-?&T#Fk5H*;VICP6YF03XvwT_hT;tGiW
z79y;e{?{0WUt0b+5`pHB+fJ-H?X~NGbSuuDTg4Aeh(MPX#MVf6Za*>9;mk}#&dE&&
z(z)~qv>@WU5e}d)L0H+w^00WosUaVQ=W$U%4u)366QzgTA-wS6z4R>r!%M@Kg2s1%
z)2T2_`92FMGilf6H<O;v`cB^&ZQZX_HiEpu*0+o=2^6v=jOa4p#_V&B-@T;PQ^QG!
z>fg={P>5h_;bzqP1DU8mR2F#R-ZYIAaa`cT8&)Wz>Uu#cli4kM|5n1--J)s{%nF;+
zT39~(WR@>f7If73;3LM2w^onrE}pXBOOoYtY+;l%B2gQXNE(_1!wTOZO1Lx#yV#E&
zdc71gLy{JTqGYUXfvVP?f@S>|h(C<Hs516wa`7DrT;_zsiAENAL0fmu*4<vWfpaUF
z$Qs83o<HRs8&a9h2G*ufJNC_z-`x-j2fEaI`6xf<B;N~4;eorkJshF&iB9cvJGE3m
zP%FZ>md9LKe?zCn6*abb-+JwAgz?nsA(>mR9gAF}y|s15CsAG4e2kGL$oiTcyLZQy
zs;t#>JwVYau9sDMD=S&Ge4!Z7kP9|<q*)U%lag;2<i8hOGTg%+#}+(A1Q2tgF|miX
zgI4^Ko1A0xQuQpM!gD)js~s#}7_h*%8!G}v-6ONP)Z;YPoNk60n=+8o8H-=rn4QWs
zC&d6OQ_q|p&4~q+>B-OCd$(v<34yw?!>KE79vZRT*Z&rHXSz2&IE1q3&v5I>zp(H<
zncInFU8gjJ<VG;cT=>yi6y&Rb8KQhYVIemA${)Yd+Tr!V^eC{axgC<*QKMpe%|XaY
zqslGVf*S@S1P6VXgs%|(2^tj(85GKV;Ci|ua4h!?<AFEe3NjAcg{^kxd+@-_dQA}E
zW-u}>cNJGsUclsw&6Hc<mTJ0>m^wXTfP(#1f@HQvMA{ILmzeS3t6T*&52c6F9eyrI
z#KEl;id78uU|6QoJnfvHFU;Z;HtTcvnP4<Fu(nIKQEAlK9DmR=@-0=n3oCh7Y>XWP
zEspxN=HAwuVLG<PzRjQ=tc~n6?!aV*y0$oeZ718VVKDCo@~bQ_+DQ-S!kStaVi3a1
zf$~L-HEPTg5RKX<sqM^?jMT9MXbt9@Y&2DCtmtqR`C#{FR`kixyfe61D+;P3{I3PD
zM`NuxGdqQRO+r`>C@}FWQvkoYA$J9laRs6j8@Y#q)z*W&BiLN@eN&{;Yu|$L8zF++
z0*wse`(lr1PAuEf#;EPUk0J?JvnZ?mBqC=&MAlcZ{^vwITa>!j&s!Q#s5QU8VwQJg
z1DorfKO^X<p?x`iJwr6-^ao4^2D~EHIiZ&`m<@#`j%A<h)oE_PrfcpTE@H-ZZo%(+
z`qPU;a%(%-7>W?&>=lYN(#}!xxE0%|F?da8IyI1CZXsi4QBS<v20wn!8i~d~xkV7I
zjVNtV-~uN)8(7bZH%uGw8EIh|RTR1w2Jc^ak1Z=(A<uy8S`yaGUi;eoaRFj4E8-?c
zxiJA%^h$Kqzy|i}5p18za-19kRI-*m4s5{nIl^DF8FQo2&6%j9>z+Sq*R`=<*H+ey
z!1J7U?3JQ3UNuq0Zr~E5FDSm*HG{WC?O%c1M7&v(xaBP5!FAI#8w1vIFoz`6D2R(_
zY*<E$>P>50x*fRhl<LWjFLwb%6ktT`zqPD;1F0*h0JWtky-#75rLiJ>FjscOev3Y{
zqGOmWP#$Li&I!T~XbO8sGkB+bG6&dV{avpLd+c5ZQ7ma`y%*~~PLgrK7J02agUlfE
z++yHV{Al_d(Rb^A)oEUB(+^{v0z%vJI!UxINO$&!$qg1_VkR~+=$he6lRkH<d!&FJ
zTh?-7dKoR<!wPudZ?X#V7IR-(5)d@(V%!0QOoENPrZKOu3gpg?p6xI=N6eA{Ptce!
zMpLL29fvD;9;x0GNL(PMtvQAgdery^4sOPpbvt5(BO{7c>4|>I;EMx-xQ3iPS_Wpu
zX7BC(%vf#(PAlNTt#MNWLm&46IS*1;u-QPL&dmB`128+cf1+s9z@9xMER;<s%R>G$
z*vSu>_i!IB&%)kF{cebnPef1~Yr1V&_LemYq^kGwQS4p7S#k<T@Bq%Fk6}n&-E#-!
zKRQ+m!zFfvuM6qM=oFpUx4D_Mm6_4C(F3CrR1OSiOA;56)`WPguzeb(<n9v&EgaA>
z@#dP?WMPb$ks49}d0QkL>KHZomgp_w(;f;E&7Y!a^qi8;l;4^e3e>A%$auR0o?Au?
zIOWSd68paVFR>O!)WCAXKLd-({pBOBk9ar2y|Dgb3;HTvAt(2whjfy+U`8|Ru8REP
zgc{}ugjis|0<^Cz+1i1{Wd_!CFCV7A;vq8xS&|mDP_uTvel0Q80k66uDqTo-42=*c
z%60PZ!N?gm-5R1$S#~s`%Q=->wu85C@X=f9unnyB+Vb}!>83Z67S@Iu&N90HDr(0s
zP~V(r@6(dSrdPO-&WIp4=000=pf)?*<<D~($&%9EgE%}hdh4)=m65FlR58Kh$@V~m
zzuGqd!d6BTcL4CA0~wKFf`-lpI+$C47!c#!x;7>TlD31FFYynL@O2A2Pa!u@U^3YH
z0|Rsk{kUS05^ZaV4G7Uhp?qiqYwlmd*q!&KxEk;PBH3wedRb;d&FrArDN~F``bZ)I
zj?j2Y`g;L{I8uO6KX+bQ5*b}g6PqE|qA$GwB4Yx|jtp>ZEm-2=BMu%K&=wNGeJkCS
zMrtc;M^8pG=!!zwj4&kGibF2gf$cv7y|BjuD(s(g+T{z&X^wc(Y+rv|iz<tew-C8z
zEM$&qA32aw<FqarmXKM|BJhQ|f#DqhUqK;T*^G<IDxpypZZS<GMffN{2H~tW5G5x|
z$qe)F)_+d)nGvzfafT%?QqaNlv2;Hw5fK(%(mA+5=1~J_+1D%S6DU+ypVFn@nn*6K
zYPEy<b-)U@u7kk>s?9a!tjd|?#jzMe0CHpO$7ofWAd-mzPLqDEx2HWdUN<>_1>0z2
z?xwP0c^eBbAwbD6BWwCnGFr6N_Z!{y1-Z`!E^-I)JVr@e?C~0g58qlYS{es8?C#pq
z)ZzO_g0!%zT_j{JT&e(RhaczBA3`8)8J}2T084AvAAV}{`^i>4mLAbmv$kffmHDZw
z$4irU9x!&O=4p?ulq*Y-x<;hTpo$|4B4z1tG_cp!jx!?vte+(3XkJBcY`E@`Fd#Cp
z854<Q!E(txYhyGi3}}vG%pszsy1@~vJvB<DMxTF2SQc$-g*C&+W5zb?+2Be#lkTHG
z2UG%AR#=G)`k1t!C%j-$O^Fg8Q3{oGPMv8;f$~QPhE#6r6#TQOLDhyX_1^y+>*pO1
zYjd|P_WaxpXbAx}5oxCAZ?O5J@gtfe*82{)fgaV^1e0KpMAg5A3XypXHZ#Ix5XmmV
zL<DPQ!{2rksKBo$B#yC9Lh?Hjmq;->``aO@H45}p-<v0mTUhbD=od|{)H?WqIq{ZI
zpd69Z7=TkhkR+1)lH{ev;;+%(3%p#kY2{C7>ld%n?zL~X=DvG6nA<i^e?76%dQ{Xi
z+rVA;5jpE2iyMUQlKzcU0iEnK_+=1Cut+NAa7ISiT3KF$N=Pu?ZV+!r)ko%)3+}OU
z2qhwt*7iVPt<lNZ5ZSG-lYBJ)OlZDJ>$<g#F=4LEF!z-Cpo)=&{=KMmsZXuy)P2_f
z#VCSR_RYyCk6xC6m3`PN?-z}N4%VSCk+s&m@Bt`!j#b-OD$?UHhW-T95ee=kfqQOs
zn*lwwd)tB*<vVEMIq6qx{Ey7sVB^<Mj6*xY4Q^2N1>R%i=7F|VC}Lq^4^!co!pKk5
zaPwrrc0nYvGK|R)gHHa9!R=9f+%RK4<1r(Z>B8?B^pmvtmgrJTM{30u*g(@8i6JU#
zC70&LYsy`o#0IoD(#Nnd-G~fq3Ocx@ntwDH1XdGnuB_5KF-}9e|6IB=Xv8c1XHKP$
zIc_b(;-_7CLVZX;COr?_Sk+f(@_+)frXag=7&EgBXA5Q&bd(M7^ci0}8oYh{O2C*i
z2F|9|o~R=R?l<p`E9i=a0ov_|SDv;O5NGzb1Mjy5A3P%eYuH0!dUA`SMk44E470$#
z<d#!K2S(HodOYsdw7fR<?!-8N0aNvw0W;%&ov`U5YCSNHQ_}E<Ohp?;Kw-TD{lCHP
zKckddp_@koGhFbfm*79%IaL^dys*a9t^g8=eXu2#?4b<9oD)zU@AIg`r&p|8!5w0?
zHs)2=gHasL0!l)@50!t7Qs>fh(upJ|6Sic^E335cu)$k9TVuvTjIe-2`gcPA<J$Ip
zs(l_%WnNi@<LZ;D7q<xfD?19PG*F}A=S~MfaEtJTB`=sfHMfxck}Jm{s5J@71hGe3
zMsD3*0Y`R~yr_UKA-+chzF_g9Hn?Tgy1m&*r)IMTiC@lH!wup4&<(aI?<dU4yJ=o(
zW8c~I#~WVRpVZ*=2;`^4pKtor%NfqtQT@)jRT$z3G-`{XD)9FkBP^f{6A|Ud8BxcM
zP%fj0H0>YL)dMyvV=8A9WTJ>9%5|BqURbY)t_xgYvK1Rri9?omBmzwWxuk9C018Jk
zS{V*3#4{Usztm=bWN=na=2aT?uVKS>5p->;n*%{TvZ;x=xuKdAr}qg6r$7g`7_dv}
zm<P<*$)%x{-A}Zbk!Xf2+1_4<zF@q3mj%w?M+UsMg^KQ-U*iC1?7_;q36_LX;jy9@
zXn{WVNCX9Ec0FzL1Xp0dL0$FoL{D<Ec&$YWQSS<XucW>`l7wxgNAS4UMrJ*i?oo`G
zpcluEf<%q5Mixy!o;d&rJF;l0XFawC?yuOo!d65Oe=UHX5hIs|IM)yp3tCU7*pmoJ
zu71)%ikdT{PWY9=Q2A&C>s~%8L%{-^8Jl6p3!Yjh?J$67$&pIrH^GJm3#Q-(!z3)U
zV2XWjt(UXqozHQ-JqRuU@10djD@*(?I6td2veIxBqCRzpOlN74RN&8bUnpG+u;siM
zaiUkrj;wi>geKe_+&K!z86aE6c5JM6+*&EQ6MpIcc1$03d@Bl)g}K#<t$9!SK^flI
zTJ3Cr+1$pN^fo|FhggQcOh0kJAvBH6b*)At(8M16m}cT7CO%TY7y=MeKY9^NAcJA(
zccx+otV9qoIvw`3v$DcZK|r{${*&Ap<b(Ss(EJ(f(}r;{DRFuL09yl<PCSE&3Tlmg
z_lYbm=-R4p(tFRJpz~;!Kkhf|<<b##_z?#jgrz`F8?<HGFZ?zZ?p-;6pjEau?WpLN
zlmY;5y^h}Y#XbYft*k3O-nJO+LK>Li+G4+~A8bZjz#%+_Ja+$<>?+kdo}nz)61b%f
zmK%M-V|VA;VDJh(7c+{JcFbrP9vaYiPT9bvKv7o)ga%1b11-qRRy8)M<C24@=0>!b
z4?2ii!@47M6xW{{!+JKFzQij`F~A$r#1`EBz{V#Gk?eqXgFbMHQ41~DSXwX{L{C%c
zDd{Op>?P^{(wd0o{O5S|P)awXm!xN<mQ%kE*#d#+6xN%(brTW+(bl>ZTau`tgzwC@
zcQ>Cw01u2J(3pmVI994>ym~=!@W!S@-A=16tg<wWAW8t(4f$Vz-lra`<biHm5v>XL
z7r|0+1tW`(2?#CK<fv$m6p?lWyUA`I0BtPUWTiRB+O=mQok&kfx47GmY_GP1{hVD<
z_{E1dOFH2k2W-ZvB}*;4Z^+}0BHjuj$P71fX%v(im|`_J6(NYWMIVcq0uN@^!yByF
zEbhF)+&zN)5Xnwpfh}OsNDV{iXTf$0a`=RrT)@z?KE=CR^Qm+b=GHl@HRX46_!<u^
z8ot^wr);RHB_0iWzcJgglI~)XhI@DdewDMF1*%bF2^)kuq$0FTUtfZ<PEes1<ZEhv
zsxv@2<H&E}y>`}8dnA?8sq_g48?^flEHkuZ&FkF&(l0Daif_tW5n4l7as+k+3gI-y
z76tEZ!5#jsHSnr||J0gB-6FpmLU5(Ir3r>8NL^8wi8{l=JxNP&lXl)Iwl>V*hW70A
z6o+2@6zIYYh{kO{O<S^@CG!ME99SX@J}=!iHcAvG&ZN8am2Iu_zB9fX+Wi-pr%3iP
zGni4W>x|b-y4fFxUcufknP_xGgq95g&`t~${LUSa*beQR@<>Z-OgoECqbR|jLFmtI
z)0QL`>3@@jy6rG*Tzp}T-=!f^!7-@zPjm23&O%M8bY`e@^8GWYE#1S5FWr#tNXOX2
zwjV(Sarf!{E}+aPimkQbZgA_7yzcs7*N6cv`Js?sj%~bc07FDPy_b)Kc|sW_$h1?G
ze$}Tsb^v68Dw3hTX(QntbIxIo!i;0kwUuV)1ON>f{-JRx!VUD;*eQZa{dX&>6rO4B
zfYu%lhH=$KkIGoi%o-LW*vdgfRs#nR;eU#IjyN1vL`4VOrnaK(um&Rxa1covn=BAz
zxdsn^6+Pu_(L<gv;Mzc;)Th~$qxv&7#_}b461|YM(dCEJKsHE4ppZw#h|5g0E86sW
zHJCY|d4|Vbv+S{#H?+Y{NA+JM9vk#5V=r#dg_i$}T5%`BFcrCR2G{iAEGWqoMl!wn
zR8~8k+9b>krftIFZTHLPu0k#Lrv<2w!qPdsz>)~O$ES`qqXRqEd(Fi=WW1v@tH)x@
ziIN&@-iGr3!mMyGl6vV+=K!`qUlr`l2@1Ht?$#{H8duR>oQOTgoOd!qj0+5KBI+)z
zM3ISyzArtBbBb+pO=kZ70Iu`pXbSIPLJ5qyCVYzq=8T&1>0lsffv;)-zza*Bwm|<i
z)~Z{*#&E73zB?#k4R~%tv(k1zgAFy@*tuSZG`z)*om!?H)%Yv<SKM$53#GKeb#4AE
zCzqat{LiJgr8lLUjKU~Mb7DZz#Ln6op@9Zx=z?9oq$IP3l~}Tq10%?N<(AS7B9+z{
zVbLSAAvIoDsj91s&dKz$zKXZj+IBMN0`liy;RZt&T9!Ssyp@5PodIm5GwE0Ko`kt^
zoUEp%p3~IV8NIc1U;2a@k%2H|pwFVFx)@wMcm)ACI-DbR)P}0g1Upv_o+1=xg2&D+
zT&U2+!1=qKH?v?rJ$#hdTC<G@6m~{x0BfB4Dd9?lzZpNb<3+X(WCVT^XyE~vOOW8_
z+-bvs5$m`(wOe0MD>}5te$0U8r$M*Z*Xe(4zG`IyuUft^W9L=O=bgI#&y1ng*bdo1
zFE6dgx!FfyTkBgcQOe315lb8I(5X=80M4X$q<5s3jd~cArj#>Y@=Cd*APGJGKzd(#
zUAkeczRVb#Ye-hL!<Y?sep`B;S;i|nTP`P*XvhynIw3_|9%T1mK(`jG*XZVAP=u~+
za&BRDsIavYAjxtRL!KD=7(9m*2Vamr%&Z%H`23CZNcx%d8YRO>2o|_5VZF|z$Hc)`
zBD|2^k$xl<((}@Sl12{h-LizGfhwJSd_(%7^f~D>dl>k!&1Xz|do?<XspW1fqX9Q)
zSdEe0^fz{ALxOG6fU`{Tp*34q#L5%Da*AbX3?i<%vO-#m{)!Iee<6J+eI&hRC90O+
zn%I<tdQc%sD*Tr87N<=u2|P<uhMX$9h$Om;PmzBmJtsX&vM2~QJ3FDG7?^I~{cA&B
z7DP|Aor8OT2sZn>D0axsvYT~3{GW~%(HkmG#|H6dMwkjJ$RUB@5XFix<VRA&ndgCu
z=XSKZ3=XVA{~t?lN^eR}Nzby?wv7F1QH@kddsq4>SXk-K?@4b;pWzg^;NrDN>oJ;>
zUk3q&0|@ASLmoB<*w4WZg2B<naKWG4!bUl?oFrydXn*}H2&6miJmo;B3@zH(0b@P>
zVCf6+bLnrTThi;K4O`sL)JAdcq{q%AOxxm3>F=dyr7v5JE3n=zEQ&%plODu0!Q`pm
zl>R|_R{Dl<hALDiqV72lHBtndm~veLw;dzW!pQ9<(cs*J_zet!@Sru3(bguqM4L)4
z1)h+jl)Tu4jJ_pdKEkBW_Jhy{GyHw&Md=IRETR*%G2eYIeJp*%$u3v&e@prYQpH0`
zk(nK>v?V@T1A%oB(8toxrMEfrt7VaeuRNo^*^C}%lo4ItlAf|dBZGjp!-vM>3=`>w
zB(%2ltA#nzk~T;9+8H26<6CQ|o3oa;C@(Co0}qqVyd!;=F4qOyb!I(Y6E<@q{B`nQ
zNpDGiN8Res;(?m(ww(1#8R*&<?+wJZv>0Xy3KW>(hM9%yQ3qUE9vZcZ45Pmzq1i}x
zrH6wIxTI2jimwfeJ2C!IMa^Ow_)$fIePmP4HvD4E|3;BvzOVmbLjEh5;SDTyVYHhP
z6H<9QR&u~U0tWvkN20dIT`94Ik2rL<@|Ax}`bX(0>9g1j0X&rO$upwY^U;Bzf#S@q
zM=Cf=HB*3!(IY*h*qgp_Z0(dYs{~F!s*n69Vyk3kJP&9K2r7>Ft=<;sex{P{N#BL3
z{G62qYbbSRq_FQvACfgm(oXtV`tJ}mUyz;xM%@~UlOo^`r1zw=FaSI0uJm0N{dw#D
zlli_K^Wg5r)*HyMwcKOJVkC4b7ACiy(5+z&zMMdYO+U$vZiWbadJsSXEp|>PW&HV$
zP%P(Pjz)`MA^(4t?no2qi+wGrjMDg#^oje>5aIWw2}G`A_IH8NKV>tfSm|+yFm-_Z
zO!_<NSbE94f5Tj!_oc0LB)u=)9i?)o0PmPd<dqe^H#orP4s6|Qw?@NFfYl8P7YdFJ
zm>PaDEWc)0mKF+3GzUmJ0d$Wze1yiglRlFEJHGyTJk~+KhkYVFmfn%xV)%Sdg(LTQ
ziijgOe#EJ?lHQiya{0&NCE`Hff6p*iJi|^sl71rnOnS60{|4~k9MbfRD+^-c2AJ;A
zz=j2c_rP>!VwqVq2b{9CdKK7~TWzVEo!R_m^*OBL5p*SeBK=5Oz@<6h=MF4@xs=YN
zx1|r<3T&lcN&kQWd<p(q@X<3gZw1Ams{y7v|D%*i-;};0y+lr#!MqXdptq%WedW_9
zBUxXfYT;bZsp+4P#{{jSC%*;>FF6u7*sl}ze@@(54x|_?z?5T_rV$w>?m6X&^|hlQ
z*3vJf|0eCEm!(@24ok)&q~heI^dsplUx0M+yV8G>K9OFRUV`%zoaR<~DBYEQA^p^e
z7|BGuFZ~zkvGkhs9LFoR#G6yr=}qZp?mglsf(1|*g&>0S8JTwpI&$U{Z7amm0H)XM
zm<6@335d?AUo4V^0s34UU)l#kjNlJ0!gr+qD4j}or8{OQYpWi7DE(Oa*!OVDNr>N-
zB>fUFKf{LY(8P1;htiK-hm-t2l>T0tNuPmBQIdEifG?Hx-k09kFT-TWJVWm;tr{29
z&xV!SI1-r_`GItS&eT*Ag0UCKL%}_I)=L)h1<O~s@tTw|a<}m!-lg=e^k1blq2Sy&
zOQl)sf0W)BsPIa9U;1y-R(e-@#bC?Yz_0t#kEI_D<iF<PGwD@|@HdG-b84@jNI#Z-
zHp*gZsQrnq|H5#Esr^eK-JvVJ%YcK3ml5&@B3A=OpGv2c>N*P30sN&c8)gn#8*Vsw
zHRCL%kiuQ*N77PyDBYHX-*372f%K8|3+ZRRokIS9C!I<!lgD|)kZ$my^h4>FgYyj5
z(tG3;52TljFx1!~SofqKNk7?_e`b~9(k5?B0Z|e_b_kIuVxVsGlvLa%u-Mj=OBTju
zss>E*mdN-K>HEa`M`tX-#L|++m!KT1K}ts-60b$Vu(P{Y(nCfw_Z0A4dQ-YD{j<&X
zk90I6K6yMi)-o=B!;+AoH0`XN;aqw|3OxKcjkTp*v6hxtl{31t@&Sh}BSxA~3^^l+
zZusd8={zKq?qnA9?aZK>DBlKzumwzfu+KsXDV#}fNROl+13G6cT^P^{>ArM-z=a7j
zl1zNic7m5s5dq$lJ{gR~OuzaIPLWst5Ra{OLY^~;YGea%Ej7w4<SY8^6ORl+>~wgA
zdd-RSS`yit0P7{2v1dSX4AsucB{fm!%#40Z__rOkLFx5=Aw7`(yY<DDmQ5tpN)X(3
zaB;#C9!cM4hG2#k2W)vs5pt;i7t-6(BZA-wo3;RXxx@|J-=Aq3LHL|=yj?Dpd4d2d
zXP!qz1RKnwsNh!IE$!SEb5Q%m!I=r2o5QD^nE?)?du<(iCA}-1N#AFymKM;~%#k?-
z!s=a|<o^KU73`QWeKj=V`}>P`A-xZldboF4Vak7RRR2Q?(YiljD!~%^{{$zS58PmB
z$f4(3XC{KuLfYlPdyD4R!ltcdm>ti?vunpe`yRODD~_#trdN+v(^?9I_bkKr6>mQE
z;teO7$1WdSKYp<+Sd7f|QhJOs=GcZg<lG(-Ur;KqZPU)N9`V{6!ipY@nf`B!*pgX0
z;31;{w}BV1)AgTZMt&)Mv`4mY?K{_sFa=Tf494{B7WzC=$4=ho(ysK&;%?1#ciz%z
zP+M1dZWYiLGZ+klPJVL^RV(;*q!LlM5BB49a2Q2Pu{ZdD4Y^C%C-v<5(9pH=U-#vo
z?Y(U65TIYYl71!K-NQNBzWkcei!WKftS|rgLi+^$4;voEVp(N!Vg>1n*r*xUpgB<Q
z7VkG@A-DYb%*K{orO!J%4<olzTLOFvMoi*?jE{mCNa5qY3i}tHB*HKuF2aE#KWXzM
zcs1@EUkanpjBcKmi31O$XQALt4b^YWU_^zf`$TW&p(VbtXBy1u%%<9Pj}SECBlBCq
zVcy!#S?%+Oh7LexB7V!y;NomlT}MB0=iRS)fqydli3yIkF)XLCf^;>Qs#Rdt6NTcY
zbxh`zqN32bA{y=G&!-T*CiwUj^9ggnr{GjiuM2|2>Q$fm!oQ~ce}hl6w=;i(#S>d3
zgA-m^_*;QzF0kI68BQp|(|sRsX>`Hp^F8SEg}U4B2=iBL?v)K!jGQ?Mb(}}<Kew;K
z`ZWddJ8hod{^EGTkh!oPwTZgHv0gfjTm2+z4Hd!*uNA{33KpVkBG|E7Te6-VJdEHE
z)rMl7Jz2Z|7m4t9y11qM-eSF@4L|I0L5k4$7zN>a?sN7}Z3cW!4J5abX}vZ87eVO0
zdRctt;eYTAE?9zQe+Z059l*cv=Ql7Qztbmp4r^e@lJbizWQ*`k9)Sip8f=w`m9LTv
zgTtmld=@({XIgPEF6Y*neJusxf8)=uyLcHi_hSN=6||v(obs_Wlb*HWT&KVl5Q2{Y
zs$e@@LSmnrW9~AD9H_0KIMgybU`krHfK+mA2J|l=h5x1N)PmCnY`KG-BRj`CDkNPT
zoMDi*gx`_Dt?;coVw)Niu3LEl96Xlp;2DB)5%tXqd$hhTiTPLkaUxGNuC!)$T8rI>
zFn)#nf-1CSY}V4oBMM92m9bi!c4al749l17T+9X-UaWsr#^}=*AbK^o_}&@eM~ts1
zjpf^sNuyG-G{Vpp&%S}mf1B0n8IWX|PNds560F1xwX}YgShdURFzA21pVquw1`8y(
zD2J3^-SqTmpgZf1Zvje=;KgKp=(HNJxY~XZKC=iov!1aqRh!=>zxv<v6ETdBg9%Q;
z23o_BQ6z?3sMeU!(SshyM`n|nb|y{<{h!);w$Nc}EL1|==ho?#{kre=ui?{TWOfi>
zg}bXUEfdnJ?I6+40Dlfld2{2R<v8iu7P}X{3ZCDFG`nCsolwCS%i*-?-y;VgYbCFY
z#4DRrJ45>pyqPqD!Z{@1jHEWVj+hHOySrCG3#VE`L<-)OY^@^esD2lK?SI=(Fbsn7
zy)^b^Y1G$|L^J}V2o_Tv?wrXAfqteU>Lc%C*fB2b*jOybRCyvhj!*W48o<BaPlnwN
zb&Oyn*!(pScyAOz3uo;NkCC$-N^8xl5PO3g>D6#<OE+2VE$%)D0hJ9TIr2HGzfT|M
zBBq-3eM0d$L40X7*-n0?^?x=*rqTKzmcvWAy(Fw<VPXJ73H|@TMx<ue&Uf|swH&~2
zh6ImZ9}@Z>Dg78Kl&bCbq2C_EsELl2VvvHZjnmrTNw%z8$+W9zOdLW$TZSO4*bgOO
zVEWXo!4SaypCSMMDW8Iz^OFAz+<lGFSkR0avs`*_c*%f{)C#bZ9?(60L|%N#Y{AZl
zMxkfJ%15QRC2Od`w(C!=0sL$HL<G8_={!deG5xQFd6ijH<<+OMBMENUVElrWlX2Hu
z`{(NBi8p+Ndbez+Kp{$_87<s<{5qfCpa%0V|M|5xUuN(@W_gsC(q%o4+K+yAR-8;S
zi>AH_{&>kZTfg(1lN=TZKQ{+3wcdlm#%o@E%F&m}`QLm`03PVI*Ik@j;5#V62l_iM
z)!hE^d!|psm*Z>D(BzVu2!1JDNYBFQ4(A#py&Atb5Q50aVjxD`Ina%D7@*9QzJvk2
zqX|4FZiN9_{W>M?psEbGI9Wf+9v5M+z-mj-8o<&#VeyJF7D~Ife#0v>l=gzXo}<7U
z*xV(}j3-_P52zw_1IV(58?^S+a0u)DB6=9KB~g=Xphq-*oZ^PtCvWLc;X%`Iq~=d5
zBV>L2WFS9%Dv`%mesE=Gu5RQL%e%z^hMrHDGFn3`pBvcFsm&qBi3M`g80^r(tvq)w
zTj*onAAnyO?i}3z!ibqq`dp{NdNK=0eX}(%UTt5CuU$N6{0>6bC#`(UZpkQsEjj!y
zDuG*u!JT;%k1X*mS(qRkMF-QA3~tCF&*3H(G{6V#F$jyT4an|U^q~XDtp#OfCqf68
zpoR^94<pyRIJL1I!xIQ&?osl^=&_m^8R7{+NLq$q#I({GJp7SX_?*zLq=NM<vy67&
zH^Hyn8Q^?%b62q(8fV-jXb%^3#5Hkca9h$p(FDJ-lFy{Cde<7D9HQM#P>eveD+u*x
zP??AASn1+h(rtQ9Y7(E!MpZqM?t3FlyhvKS8-Vr`)5k5e<i`xz8@@u6W@G4FaMz<r
zGF@}T%(QIq<T1->f$L09{ty+IjZ&x_0FMnku--L3_yYdRwE;*PXmP>>+fcr=ffBXs
z6Y1mY^#3+=rsxBx2=2sTVGpj8|841(9hg>I`%Nxg&;vfQdAyY+*gh-ckw;reW6h3j
zn1N*dZ}E_sjgk%4dV)|N_RkyZ@Vg*Rgv`{p<b_Y6dV>DnlI~amC?bR!<1wy<8E0`F
znJ%@wNDC_em@PbV@l(=MoL^rXt9WNr&yV*f5d_G4j&ZMTbYV?j+>QZPkN2lZWT<z`
z1vxKrj;gH~&fD`rJGA-`RT*%2GaG4{@3Vh<(m#?uD}7yhhVx;fpRwdO?n^(B{!V(^
zJF=4Yzbbu0`mFR674cv^olAG6Po#IH?@8}_J4JfL=cLa`pO;>h?wAc(V>&;Pej)uc
zjj289m3DS+62ndD85_w}@cdf<o^B0m4z5VLa%nAa4SOiFX`WpXvL*Ik(j~p6_BmtG
zGEDc@tlGHtFQn(BFG^pQUbREPYqG<w^r7@4>F=et2O^wHFG&AJ`a|g%SR5@2aml2Y
zpGp5Hec%_T#d}Hmn)C(fIkUABYu@?@`F}Xt3sW-hum%j>+QEWF|A6da99GZ^6D>#u
zS=ZcZ`VH!k4;;vpASghJoi!b&)j5;yKEe7Y8}N$sP3iN}^T1B&X&*?f^t7~-{z>}J
z(pv-lKa^gRz9GFXJq08l0Ard>_(=MJ^!E(c=;{Bf(jQA-WDY{Y0#0qVdzu*W57K-4
z^3QGJ+LWq$Yif6Z4}M4urn~us&}%`;Q1%1j+8R0vpM{kf?>XYwOFPpf?vZ5vr0_qq
z@Eg(}O1F)}dq8{aJ?R7K=hAni_gsW&^Smm3UHXi4lc@=78}XXjxPLDF)4u&#NH0r&
zEPYLS0oY{%r~e)s{!Qt>NN<exqNXu!hXgP?Jn0twuk2xmsa(c4R;Wsh;oLDBZ-po5
z;cpEM7Ekpj<p$*<2<*WR7-IpYjrOYaH_~4tgQ=a;P%~fdQaYmF=*o;{NdMc?-$<WB
z4IPrL7^3h~sbQ{96L)?|`nL2f>GOD>)&|2Taqpzpr3_~>QcIsPYwH9I?#NhC;)$Y1
zNFm0p&7i;FV9A9cb`4i9(e|rP#mth-hyXK!#S&w-B`Kbc?tzRcDE~(Kmh@G|LgtoT
z769YVOOK>af=iQe3e_J<|4#ZN=>*}GHpA?pbSB;80MeZ=-iy-LrEf@|lU}j*^VAsp
z33H*JB8VCebE|DYK-7W75=@DsGBkO&pT2ks-6G@?2OxAi%{n~rK8%V?Cp^oJFs-rX
zvjt6}#i;z}(sR<+rGF=VnTo}PhfnYP6gc^>q>l&s|FZOF(qF=Ly$vgNXP8MEckzH@
zZCaQApGyB;`Yd4j99^u)ChkkO$>4@WvKHlw#5us98iuDR*FHpQ-Of3%c+pY*FmzjM
zp6oE%4E3z(B3#?ia#MCkZX0eK#Y*yjQTj9KZ=|m=1FayMpAb(yl)i8J-wx#eH`1#>
z;xz$e!Sxr?^U^8x*{j7%{N!&+e<^)ddKT;>We_Ru$)u;H7o@ws^8WC*7-v2q<Zkhp
zQ4iZ0+qnk?WLU@xkRxwK4r1sLBJcT)k|jJOGOEym20I^4x;MzIR}Q`{eO3B%>C4QI
z5iX-($W$8m^_292bc7~pB0VSlx%3~TKcHJG?YKnZp5v2K+V;Bi1L;A!xR7p2Uz7e%
z=^Nm_x%tc1?B=2Lf%Fqj-R`~-_--^#3mdQ`EtpY_nS)gH$W$dZe8XU+7NaGcXwKR1
zQQy51Qf^hB#(uFNVUFRZJra9-CeqW=H>JOk{!n^RDx`<fN;;vIpFwbXnRz9T;^Jqd
zZ%ThI{UMY5;)&9uuCV@}m)?|q9`nfJ&VMXDhn#0N9cD@$@rESn9qG-{c~~_qp%X0D
zmN-Vl!8veLM<fYkR;*lQ;6?mZL6mog!Ty;qeq_ZB`S%0bGHRL-W?)g}b@Km1>D$uR
zrDyR{L4Pk8OPc1iJ@4|LO3z7uEd8nU1#sJdvzDB4Q%i@WxIc=SA`|IZ>08p@NPhtS
z(DJ)e>mgrCFG?>-|13S;H=_YTME7(IQpyQ$W~fu+W<W@8YduRfD=OS=s_>le_mjr{
zoA9MWC}07hY^X+#r(em0{HgSp(&w0gSF&c}hR4#8bS6Dz7~9pwe<J+_(dUA=VM*MO
zUhSr|l3qmqS338U^mXY^rO(lJHL>u!1Q)#{WzwtCKZWIU{hu%laf+#Eahf@=aE^e7
z_D3)|8=GY&YwKu%^e_A6iL1R)^1YfotUzWd3BCaoG6n%95Bj|HIq4aO8n39V6$W)r
zIWF@l*7GWVeI4s_(-4yM-e*P~+mYRraq$B1;Wg6c20FuriKfSx?jufP9d6(qGmD?5
z<Qrq;Zs9=>356d^4;_Y!9@wa9`2aaDix!-h)G<LzcIhR7?}i|@BxTyto~-?etRaP0
zr9YOwOavl^Gh}4+V&Be4aC50nZt;uK-$-AVZeSede5vMFPWXuvKapxFy&`>Cdfi3{
zWxz2rGPtDA$I_ix`F#g)!9j8b8XFUJw<u%G<mjXj;b;)$w#@n}37KYAH@Ni0^RAj5
zy)YF3dXRw|er`4@|E2U<=`W-|l3qXy<ENv$XNUJXi6gt?;y;tV%tni?-`J?Z^a?km
z+abcVcwdnILi$>Nh--}$*T8uV6}tJT{0p{tTK}gxM0myXH#n0yeT|(DZ4p?5kOKLQ
zsA^5_a<-p=6r+db5=;a?en=sTFC9r=LWPfrpi4U<B^VGnT_T+dC%5|N(wC&$*n%BN
z&<%qfTfnaF;sfb((zm5Ald?^b#g5SDc@$|e7$hI^+Zu~1V)xAHw%Fzka9WSY&|>fw
z7I9W!7e{tb_~St@UW<mE+t9<v12aImf_iArdz7&N2huCjtJ0yR+DoR}2GgRzS{1$l
zkEPe7&k*TUTsgrNujmZl4xCV0`p+Z%K&&dWyYZ4+BeeED5hhLuS-0%CxdCt+<otMW
zB3Nz}m?&sP!F&VlCuiw<9GQy3eiS<k7%gcy#08=B0YUKa6Zpdyq*tgwXXGw1#bsh`
z9^Ezy`F}xr9-uQPQl5fnH6+D73^85&vh=F-3`Cl#og!WnJC#<@9;fjY9JH{Y;!s*u
zW@9Tg<RSzOu`wSUMaYG@+nvoZ=w2Z#^NwJDAw5I-78__nvrM&bfKH^(OJ9{<=nvJc
z4Emm;2r~!razgXh&2?;yn;F)BNf^|L&^OPa!6Ug%fUTL{Hn&#M5&1_YDoUn1C>yCs
zJF?ap80}RC7N$P|zEjL)Fp`2mePFzvPIy87Su+kQ=M{D&ZV3^-(AX#0@Ry~Rr9&$h
zgdi$>dvNjQVZ)R0dWobs8Ju&erEwvn9ZJM=kx*aRpVy@?Nl!~9KA^HAe=_ozKvc$S
zcu^0WVlzq`Gu9XnW94T8W$db{bReDJoC^kOL{Cwg`}x30D6v~wtWs{W*-&_{tQgf(
z{w@QUOLwGa!B;Q)Sb3nj6C}}Lz$rjHwu7h|>xxZT_!_01nlfEod>}n1y~tT&yMEQl
z+o>AsdmbuK$S_dKBP`hhr8#Ha5B*d`8HJ)I*btQQrR8s}f$<#xy@rx7HFos{Ax|RD
zWbA?UXc00)Efe`9gFv~n;#$W2O3EX<xOfKhDnoB)7J=?Kg>1>b?|g8&_>S~~^c>Em
zAp(yLC~WZQXapC^bU?&#Dt%<95VbZQKlXDSI=p3~?K$<XP_-6?*`fj-;wV;OXFpjJ
z2?Q(kfUfL%pMw`RB#k2~5qrAB*2;G|n<@8pzL0-Rb8O6qwLD95Kr25D#_je21uOxc
z#ay2yu}K6&Bc{hkyv_NEmI!#t?ugEoZWv-q0JG~40nKe+mFSgM3p(L9<f!+gyTlzi
zb@(XJ2UBS1z%tuvm(&n$m{p4aqT*nLonJhMGgp%)%~^s67K(3poS<i4dGDt%dsE=C
zwGo?FWMd+395T@}uACCtL@L@Emb@Z{J)@PSM}jz!p0TqQqJyF^Qr9`LZ3n@%yv5w2
zw~`+X`bY*q*o;pDO_A22nTXl~l1p~d6!mC)4I|izvVDfoOAh3y%~4I=^%OSj@xsP<
zB?EfI(??mO8r=gEZ1~(j+o%k0Ey`KiK_5LG4sqw!J{`$_KqE7gaF-5Dz`7sV>35L?
zWiS%%N)Ja{0ZKc}vNS@sH|*@_@SF@9c}o0C(`Rw)u(dOKFvFO3KC~kCn_8+IC|0!L
zdJ%o36~iHj3)Am4y)4JpWct|1;T?dyW$aRbhmD~_6^~F13`k>JX<{cyghrM;e<aA+
zzyQf8>l={699h(MJi(SfsoaN7(AnBToB$Q$jn>j9(%k`pw`R=`h|!|^c}GksQ^9c(
zdyRdlf#BA>{Grh}7rbmS*uOj6K#jm=fY!-xTmYJ94&g=w+At^k5oL^B{|wLo;zrV!
z*gp$YaKnjohwR9i^^^CC2^l{02;xP<Mt;iberY+TnD@?#1dA!?|7HJF$%5qU;S*+w
zhuI5|t)UV%;W2Am;?`8IyFOyoAT2wsVhetChk5zkl@l9vRal8;YgTAy)T}Og>Y3rG
zt!!Z`U9*Qv^F77j=`$kDUH@Ebx2J7Ck?X?mh&oqL)TXv?Cu9~e<zYmDrqT%{;2cS0
zyn3px7v#7j223D}ftbZZCFw7%pxf&p2<lx5!X8RDDPm3V7|C=w)l~bhkik@*lED=0
zq(~vd&UUk0qF5I5XG^#k4YvK3m4YwOtu{7rWY~=*0Rj6&9cqOjM|^PD>PyV{9k%q=
z%_VQtZGfn(9J0T2);UoT9@_kvY%q%{E{@)&xutAdRIN5>;&`-%(U32!d6B6xDq4%f
zV$x9;gylT*1qGziq?i-Fr9PH=K!%J!X$n#Fl6XB-t?f62jQD^s1MxT$JKHfg#$@CQ
zofM-qpDJU=#Op!q6o##`GybkVHHC&HVWgx}>A(g=6)4!nKyrgEFtGsi)U=>L?MqVG
z-ZYl*ZwEX}kZ+E8_!S0l?Z<(|^Phr$Mjl(4XbzCv*(imq1qZn7AAWXe7t9SpZ*8%z
z-aI6n+vvO%eA3FW(3rB?*`Fn#KorAfxa4E&4_N?kb)Q}zKw*`xwYAihtm>9CA$nOz
z5KboCH;h39YoRW+0bp$?<R!bjly2dJCL|l8qiAY2c(?%qGm1P@*oq69XA=LRBDn4m
zNrcnOEI`_kOK;H1Gng(t3_7Ya8G@)tWisOEyx*rcRQ)J<1^0Z(9u92$vEe#H>$QVE
zyo9il^*LaPdV5J0m_-UaBVUOotJwD`1~_NaCjjztbOX0U@Jn+e8BcU7ePXP*#|CO$
zy?<eOOhwwd<m{M<iLD$&9}_HY5D(^PPGQ`GbHg%60)VI?<rv4xy300rl9r>fhHfw-
zrc>w>Ga{Z8fd;kgz%?)9`E$^w8w3q0q-n4uGq9*(4JypQ5ewN;^^e>+H?q`jpTn3N
zS}KDLTJG84#vWtmYL{G^T5wysgO$6q4xkAZ@^XLk`2MYUbNBLVOO8X$szDwYfo8a<
zWb03G6g!N3wwG34J^z+uEut)e>#U8y9az|qKXu7O=f_xtEkmsuxKwk>dx!Q$-A_cI
zQKKOhW_N92WDAzm#+w(UR!d`T$Wo>jEPA+kdl1kB)u}lXBJhtIqaB(1NCXmKW*FDZ
zUT<d^?XZNt!2!G=-Qn94v_jSiTJBHd47E5TO_-21oS`0v#_Wv+@8T#g@KpSS*hU$?
zHv`Y>6ri;sw=+<*1;=*XAdz45MAK0giuytP#$e>eveU(NCrWQXw!@7?H+TeV4Pwgi
zEmxl@kl4anC%2ZV)RyQRUw5>6L*Y9JfT1%5C;25l-7%V053UcQY{f2}8l^PQt)NwR
z-g^h%JN~3VDFp><%AYi&`WvB?Q2r^v>Y8fW(i-{l!6S$V+fo2J=K0ftE%E-32U{RR
z?JoHDg*D-{^xe-WWlr~1ATxtlSOzmC{S^-qEQg-{G+3EP&E~X%gccTN)h4b2NytfO
zZ%H?CGzAsN(guRY&uRY^B5;oOm(agRNJ|3!V~>Li9MzV)T~OGW;fa<!V`O06;sx<z
z3Jwv#dV|GHZ#v$_>kApRiUM134yKSkb7P8K-M==^9vUsG3Qd3iuAGM3T20}W{c3BY
zFViPgXg(U?k3CI_K8s@SMCVJ+NjzbDbSYRkgbEH|N$i~QZWn-wGb2fL`|c)nm?J8s
z!2)V5M9D|3zswBZ=TP+)sVsrMw<KtV1v;G$7k&FU#qQ-`4Cymv>vsy2a*Q)ux&rNs
zASl-JXpp@61{Rt%0qxouktH_{V=t|0jo%ko;lu!NS$QTp7JB&0#1=*t!Oww^CfI-z
ztI7;N6?K~c5105ekyoc$>BW8uAHFU!&^eLi)->wa(%epMcdS<Upd)-xGTq0x?sDKg
z{QlASaLc%=8AIw8EJ59`_;uDxpwCeQSYcH%o5)h_M=w#?+CddLLU^q`NHJ>tL+#F?
zx5<2E)cYs4{uOc^UVlWC5vaJMQxoZg=7?bvP4Xepi?pHG9bFfJ9OwJ`-?ED7k(QQz
z=l!!a3-hF%8x1PPynpg~$<~+v4M$UHZ}VgplSG2JGdMcPsLiPTiMT8M*3yEe!bo)!
zGvGZoNQ<Zi5P5+QkD79AuhaOIJIh)M_%2l+@U9Iu?q$H8GF6<Kdx%N&Vu=^=CWA9Y
zGjrcg_p8u~hEQ+<qOx)^rr%Dj1xt$qJRn$N4=+cDxd((G8=|+ewF&lJ4GUo63OuKG
zU<;#3bTKuybu;6<_3j)X!`2Md0qfTAhX>XmGTZ<)K-ik0)DbMDMLXkx!=7}{AFPh8
z?HiGJh^<!{;NMf@@Fr26TJ|C3hOlTP@QI(8F~up6y90jcIq4Pw;f&zE3m_Wcp$OAw
zn2`#<83g3hef%R(tW#3(L+c=j^dUEIF;c_W(WiK$U%0EVTWib9r>+HIB9rnZ<Y7fx
zI6*IW`-6O1!_Zrv*@@D%`Mo7^#_-cpf&GBRyGbgL(+%L2@pGp^XJ(}qIF0B7iUrzY
z`Zq>H?sThwdO5K}&4uI|OT6;Y07!irWh8_V4f~q!2D7|s(zGyoGwXudlG)8jmpA)K
zNNl*J^&&>-8D-%m4Pql@$c^+sdWI9}YxK9U+Iw)3x*$HIwJ1t=7sx4ks3(>O3_lIF
z<=6tXsPtCmbc+2Lx(o%{ap3yPG{J9Cml@QGNAPm8QDZP==^J413PwW&OSOS5iq#8F
zX&8ertf75dN^Iji==5MuOD9I6UmHK%yW}fZL-$@gV}9iH-&E#%Gn?tzhV}<~eqf#1
zvI&f5rknKu{P3qJ?O)bb&qxH1f(qC$4{B{UY)RoGi;fq}t&}*lK^u_8hV2q`keaj9
zMLIiP11N1Er_;v^JA@}Q%(yc^B|T#;K}Ya;Z<ziBCAN0`%WRh{Eys!%F0c?eJrN@@
zL;x9Un}JwaS-D}9W~yNne*cWfred9<pE?@hF3GPqgEB^>DHUpaWNUdQJ)lt`N4v6p
z;fuT^S?{N$jOtNhP~>d0HHl^qg_?kqZh-TGza&fU$^nxzBG882zC~w(QeEPt>+A4O
z*?w%;Hfu=7Q+tXVL|d2pY>Mbb3(Hkl;t^H7OV7h1#}9p)Q-+E~3V*z_o{%Sf3Y*5d
zwy3HhMLBRmWX8GLuz40_z3E@hiJpc9tgsM%t4KmawJsTFKN)1u!y@pEIQWvkZlO^=
z1l0~+N0)_o5Erz831LN!OK7dhHV*~bk}Xy?xG$bEBcMEIGQv3emchopfvkkCS6K4a
zY;d$VbUgIOz%~sBOQz?)paLD}#Cq`l87+b*#26R2)Fp+V$nm!W100`8p9nLx%<ljP
zc!>|{!GIC6g^%A+ew>-n4Ulp@u;CRXi3+_5zgw_}=a3Xv`w?_&A)73ntpVcBt^7T#
z!)CyuOOU$_VOC*9mnlBvVh|HXi6f_pVrJ=hX%;$wuyG<sMvb;5-zqVzB_`p347hWn
z;Xp5dK!W_0Tit4kKda5KcEW4S7%hP0D`K}JJKb&^Ku>hfXQbz#UT3Umtjh{Jdm`QT
ze%A0Tb3(T*&wt704MoZ<zU(&NoJk=Kat=h#ZA4^a!%u_h+ku!jpeS>)mJJU0VjoMZ
zsWHpayd1L=8W{znUKi-ZW`vZs1#;u;)ZDYM_v)G9Thenjd~AcM2o_I}%!d%}7Gl|i
zW}U)v!U-FB0UKpVAWaH;Uy`(B5L!^0BVsCOGu!I}NP@lr?I4i2DTb*c5sl{ZOP|D4
zusJW0-p=Y+wbkc_t;nspshU(ZQkaNr4zW9%{V`<mCR3Zi7F4BRLkq8VNGY=Ggp9_v
zpv{#0%OU<?4mLlGz!KN&7W|vw-Y4GQ+=j8WY}KIx#Y&c@B_$z$iEVnsgX~=XGN3RA
zfV=?zX{`Cm3FV#k)fRqEQI3s`a|#~zdY{AV8@NL6Tka{-!PZniM=n_?-+_e%Q8&rK
z;CHCwc5oh~`o;1&3Y!^eIQYcvpap7w4=)h!BXY{vs}mcjupJOt8WeNFq=1Q8xr%1w
zEqv4-lH3|GEjo8Df#jnUoEe5alEH+1Jt1`2SWaGAZ}0&{|I!`6gfEq5cXJl9Br0qM
zb~o4oOOVs3V~3Xu8uQv}gCh?gjNOf)=TU%~LM{j_vIm2w0Ol?n+R{#lh>*0j89}@7
z-VG(|1MB|YV!fl~ra@A=t_HB6moGAy6_MPA%qRp@xf|R<EZd;SQP6M6$tP^YA&aoF
zuAL}_Z-_*L&5_LM(rf2(2QV>i$br#b8XjXoIXsbVdL0pAfHoG;B*k7B0jsege%&+F
z_QY9VxH-8@dai=(Xow;DQ^>&Z(hd?ouuyO^fbnwDhg%kVZU{w|$V~{jhe8FgazXj@
z!06u#%=HRSTJA@nUf^pjOr07Tc0xN!HRy&2gF2;RF|~m5oClnf-<@DsuDoBG@N&xV
znDD+0zup)ewIom)v7U3jI-v_DXTh8$Q0)8g$RJaU9mvI$4b%X;<%9BH3p^F+d2YM!
z5{^?4ldol<XI3NK!9HB}TUwKLT}t<u>ecCwqix}0T2y-it~TFi66O%Jqd*tTua-wU
zr}lX*jWwzi7Z$|ksBO5b*qyyfNM?RN2(TMt4s9WD9<UdCR~EeI!l<`FNAh@Xa$Oxe
zmeaGg=!Kghzcsd~+>ZeQ^E+T7bMS#<J4Q*c+x(Sv9~T(Ya|@JO<1aP?{5KxEHq*N?
zoHt{*XOM9t-Rsq%S5W09lq55D)G54CSz6k<{&<>@WB<as?$FMep-BB2fUFVh$RO?3
zK*t(kMrE>Bg>TTijrD?MhAt#H=8^Qt{xI#R4o#_IZ;S&K8TC!+^g0-^wIgRH;9*C$
zlcv<l)`M1raNkRtN8f;UOo(p+LKrr}Z0WHsXo4@OY#(CuTavoV(F&B1c0!34#=&y{
z`viBtu~@CQf!Fwo+>qqTxC~L#ZU<sZRF^rg8rl;>ZIa9v*B$9sqIX-f5*y?fk*(0E
z&W!mZY+3CfUPouj3On0t!;O~@VKn^K4(8g@?9}wS=s_MA5<AU!=az(G3Y&T)51le?
zX&r;&jCK+}BuE|I&Wk3Yc)S@^g$<T`hgF%3I^Yrw-!QN{`hptNrXoe!4$N4ER;|o$
zT^Q`v7^`tL*me~b^AshGF02ZZ99*b~W5+W{nSG0dKq&78Fy~@Fi3`->nE0l^uwM3Q
ze$BoQXe{;?d*K+J5fPWWWbHqHZC%B|9BKh?8dBl)emiRjW(NT`!y*RrXlFgT9gQ{p
z_p$UW@S-ezi05+tSFZ-!X;PrKxsBlxa;q&kNSbil`za&;X}R4TNH;ht8(h(BApM-!
z<BSD3$06@9<Ds{Y2M5CX^Y6$`rgp0-M!T?psM+6x0<@c0RU%lGh4IvOF2k1H(-0lE
z?D&bjYHmZqwxbT9CWW0)!E4Q!=2*PI_8PbTL0XLRWo@W-i}2T^B0;d|0^cnxjuyf|
zc<<UW%1xi6(kXDj$dNnF?U|Pb@a3aS&NY$Fk--sDZoZ+-?LzvPMzY?^M~P``#}`d;
z9Wy(|VRKz4^M)~(iBxn3=61|Nwr0J0W;lYX60&%0JEuf>H&$*RqF7sNw-0l(!>Y`I
zz5|0E(f`c6=RIjFJ!KTWsE5@SE@v{aRb)Lmtm>A3Y3T9VfK>F*_pOBpkr3zRo0AA;
z*Aa+?=4E^-0?kMzYJQ^FH{iJiq!CF)F)z$(ODfijAp>w}jc8iqPg?_JbI`Ss05n33
z9SyvBe-}k0_mEbC#$R4EI0Oig@f1h>rr3CjEX0<xY!_zX3VhuhZQGJu??FH<G|<Nk
zPmWg#K2*S1OH6g=gu<Og_hW?r!Wfj0q2|(02Su=e2&3(6#!gxjLF{Nl9FL$wjE(uo
z$oT`2D(yeJr3(XmDis`D6FFxpg&mkm_ejta8YFJu>DQcra6qsU@xgSUy7A;G<ciGA
zf$kCuolzgm2XIEt9tbsv6!rigxWU6^gF4a#X&p1K^pt2e412{wTuWo!q9-#0;FoOI
z2%J`U(jFPGX2jPm8yZ#`7`|gIw$_H;hUTt`XbbDwNa@EpKu}ot^}hbJ-0Dm^!)cx&
z`L&5dcm?@FMzR*T?aI)G8Oxs2D%uP#uKACs1c*qtfYrJMW35J?zqVf6Bk7n<?tm8O
zY|SXQ_R`0IR3_}=0<i~*HI+`GVswwtpsh3b)d6)>{rOAHe(o`_;km&naZOV71AN17
zKLAN3mgl8srKLIjiIGTzGoB4N>`{8FiC(8<tsC~=4E-I+J=}^FM&``fGi%IVl!`QP
z<BgfhHKQ{R4C;zhpc*hZ0s`F<p9c5;oVKh0RVz}7?)ol>@gU%km1wE`1gmKYN$eux
zs344N%!6l^!lmU77DzlyheEgwD=bFb5Sa!%8ZEuUPh`O}!@AC_G1$GlEG+3H5oz}d
zO$fG7U>P%ZNDn-VX&W0Vh!@o2XA}faEdm?azzrMy$l}e&>kH6@0*ohguDd=;ZW{}Q
zf>#x6u?%^2VoP=LTT+SlsH7L@SWH$aim)T$Ps5PyH5R7<tDPDinwtW51~9De%J6CN
z@H-=ql$=vK8{B+r1jGX9ctTH0VdSGI2oFCcteD5#*1^Bp0=kJ;u;ck_8XR^QpcW2m
z1i>xil(K;t3F~)m6sS2Av<>n<kse)_K{lqjsqN{~{H_@KdTD5N9ZBHC3diSGC2Ps|
zgG0Hq46qFyTr*%K`WR#()r=E-&iAJ%4K{D^XSV&h+Krv;l3Kn_orF>XO5C!#LnPjp
zz*$ib7)GBFg9dDp*+b?1o}Brp4{+wujz}`B$d+eGruEb?@~fxFiC%)KF*UCgY5lqM
zU~ndJ7+=`{znX1(j9yn3ca3ELVW=lWP%G3W0<x%RZw4%&+%zVbRFzr9bL($+vw!v3
zK^Ht>XNARIal+!Ebj&!MaTqu6V=fW?hJ_517z)_2fHwR@X&t)_<|0DAsmY>?;I<Z*
zw1&8aGEQL83>P3CBW>DSV5n0=ymy9#w`7VjSn6u=f?SyO!RXeKg&eQi1;GxXZ29>U
zsvnU}G<2$r;5Q+KxQX*7TLWEl$fWCm8OUwy&KfN&3__GOGae2O^~%|(K^|Hebum5a
z6fw=o&a-{_N3ta5q0MaTMO(y-nS(tDsNz?4R&~o*wup%*`-5OgY<*DG3UdiH9^;Vj
z>_#ChQRq50>TC%Ru(2fM92mMoF<V^hIUAuczaJ<}#`E_Au&7##R+6C2)YRRQ&OCYo
zgjMjXHOo1%TF;D^+Cq4IGytNa43;6M@I-+bRW@3y!{DPv9LRcUv#6#J(JnCZBl_R+
z<QwAf45Dob<}<M=IU({+|HJx6lWLB&s*J33!70%lgc#tyAd2p6WJg$}IUOuVu7ABC
zc);YlG(%BYlu$}H?WC~IfNlw2=H!n%Z2JT01{CYvAd9W3x<s)lRDWut$8#IiHey3t
z`?+9j&8@*=jY_Nr!@N8K&HE?Aq*UV4AjQ@axvmH~(7Op!fC7lhp%h*`xe+EaxG~ly
z>H#auk1~JrHN}w?x?Y;~%&aOQ0@_{?bxNw5<bM+^CQ5;&LDEA2Hg@0K$edvhHWo71
z=GA-hO9}67X=>xZ!*0s3s9nF+`ATw8F(@q>J+Qr1k|oF{UTE+NQ|UyyMdp18ijt$9
zM`+Y^5Ceo3ZGj^)Q}Veb5r-tvJqE%YcNdg}ntv+P{<3eUhM$IN;0-X#xU<lL#zIUD
zh%f;x?ARMA-_EQsB%ZPL{hM>xa$$9YnU(u0<R89eMEL1X6CEe&i7SA_BM_A}rR<&?
z4H7^oe9rZ2S`;$W?sC70ApOk73~-Ghik#$(^8e<#3`~SEIYgcXES@lGK|krCSOw|h
zg1DnHD_^6wDM&mTps*Q{>B4Z7mfPO2c?PpV=pm<p79iOr`NM^6m*iiTn873G(aYM<
z3KT8mR<luJ_YVu{1rS8oUGdTp!Docnv(XG7%G4`7b#4<u!t_<x=^+wdb58WjHvVyo
zRY|mmRHZy8{OCBaQ7M|D?io5<7<npoSTk7t_>~Y?36@e=;?nTsQC95kA6WUhwIK)4
zoFqR-mWKlfMM&kC=U4yA0^b%?fDsJTi6|>FjU=ZVJkFM_SGpOCH;;vkI$}jZdPnY8
z_y`-n{+#+oM%P8o%1@EVY+$<lQ$fKv;ZO2@HWFFZd_M?|)asnTv9z(n&JZoFPqtIO
zwA&U2xYXo+J517!Wm@i+QUfiU+JQPpl>ed(kdZ+IG4m>bOH1|0LWd<5VUD@kvbE}g
z1CI4e4<GfJh;)PJoj<t-_E<WVo<+86P7Pn$(`M3%^t|+pA4wbbwdKKzJ|l6;LgWNY
zz5Tm|o+z{}EJ^`=B1%u=y?C{4-YIf!Np|NpA#rUF+gX9UpXO9%0w#Q+q}^gZ*gVmp
zCNwm&^8}(eeFg;AdA}|L$SgX@*b*{}8JWhg3q}@%E$NopP>`78*;Y1NKleDGCBwa8
zB8o<`xe-BPc}J}0h4jAkb7?Kz;mX!};0goDZ+IJI;thU6b?<;XN7^;zP_u{9ed)fP
zzIjz>_)xkhJ!7W=Ec=mCW!;A(jYt7O1nzl>8wt{73HY=cRDi>J&-=VPnE^dQX`+X=
zLmz9_W@?l8YSd#&+Ia~6H*6yoLH&Ij#y3Tc=Y}Y5aEu!_pbgxfdk`L`Hrglx;s%*l
zp8vMex%9sDK1V6%^y3x=mj_2Cm_9ufitt@lW{j{jT<?s)f8jPb0~iX$+?vxDI_pK)
z9{O960T=W`NBY*<%h#6Z6#EWf!-;B*;mfTpRYt@%$0g<=|4ZqY(ud4rJiu292N=cu
zb2AVM{r^C^Yhy}+Qy<8{+L~cUK2d87%(bD3W{NIN@vQ{|q9y|bB1#Xf<pBj>XmRq>
z!Ns+4^J?@oVvta^l>0rpF8#WcekFY%-Ii{Hpornww~1bYVAOM<7t%-4CtO(?gQPIy
z7ZC8w9l)jZvGlR@GWKUeTgi@PE2J~&9{i)dB5=6rRHjL99I<y~jtvH21!d)G8$}8e
z9H!vwE<p#*d561>!NuxTmj7-sfKhJ=E<v-OKZy>r9|GM!-H*L@wt*70XG6uZjRp9n
z^h@cCqg-<1G3TsXJ3!ylogYg-mcApMNG}m)L_xj42BpP3AH_g3o^4^|sfGsF5;GFg
zSq&V(l<|>;vB`JT0S_=5!LI1}pvaxJcHBT~#Qz#;-2$&0F`#GCFQor2NqR-PgYf5&
zf==Pb7u4FT@S*3@+tNQvQ|V<Jrk~p&n44(VM?O+MW!OKJUXq@ZUg6~zIFNHog@P44
zf}wj-nwj<3^lM82LH6*U*dAN!)0<jkTAR*igI=z%)e~z7h}LP**%Iw=>;30XfAPNb
z6X_-CMZ8~hLT6T_k$IJF@!po+l3u6bEJrCWu{d+Q&tsmqa{%v4A4rd-n?|au%?HMO
zlhaZCk;RZ=ZC6y*Yx2V|FvURrl}SGR%ON{2XI*lur*-&IRNtPKo}m&JRF<0R$qB!H
zISAZcgg=%3foU$sK#+m_m6jYuD%_cnA4)Gsx%4tdd(vky)#%6&c|4cikiH|`mR_@B
z*xY=Wc<sA{;e7`XYCH#SjKa~<P#`B04%yTNGlPOfu(T6MWt6%e4>tpxicKFR{8WC6
zM-VEt*>8nUsn+|_vGl6+S<<!|PV_nP$#YUIeJK4B+t6M757I3um!7w(VP(!FvW9g`
zqDqSTmh`twV7y7fd&z*H2uQZlN78%J=^%@YcrjYda*~({TOv1HEv{17@PrT?S;8B%
zm6qT(+DRq@>$R}6j|-y`N2C?~Z_@$z;qre=`j1QtKcO%ow*Dp0m|}oWe6G_*dQ1BM
zNvG1E+rXv>45su(t~os7GQ9U%`la+w((}wjTru(>ijy^u8G~SYpN!Iai=&$2D^}Lx
zU-5IzzzkH@U=b{gTQ-<*X1ut`AdhWPn1E_ZwnyEca#`a^M+|6^|9_H>r9Xjj9lsJV
zWkDzNhpzt{=>zGz(lgRhx&zQ$S$9|1!}CEBwUvG){XlvJz-G$PdJ$O%D0Y`d&~{(`
zx1^gk_#z;{#zOHD8TZUskXUw>Y_tZAY}gJbwwJE%9HpO|Z1@QsB!S-?vP?S@=ZO9J
zZ+M)qQu19`k5@yY_JQ<aaCyRn{NGY7{abv+!Xm!fK#U7Nk{~VKd(z+1ukt+W6sSSP
zT4vHi>0=DMIe>s5j-_K(Ic>2ca*2Qe&rzMuhQzRg9kPzr+1%oVh@E=YD}t@u=m@n{
z9M)t5PPH6vfCMDGBmKR!kS+=3&!l&xQ(iE6f}cw7Q?a_*A`hkSk>q}vi?_fo0a)Fa
zek{EmyC+@zNcw?PNDrmYG8E-Z$_ZQrjQxG-C(@(q96*#YCRjWfw!FZxgbucD)^>Q(
zC?duv2_e@FIZ<!-IJjI}rM@)Za7y@@jb_0~3O|z$q!+MqYtn}u(+CqQ??=+lsnvG!
z{~_k$Lb`+e*H%zV^8e95{wag_h4i;l4tsTJpmCr`8|j~=w+A%3(ElmjFc&uNKe^ij
z3xfg@ZbJUGm45|q_>i)w81TI_Qdnn$YWO=7<2_D^iI0)?u3woPk^ckfyByzp2V;8K
zKT9ehdOwri9jI{1V-KWSdKp6fnu8frNN`E6_A`%R(#n5dDx_aZFG{aSPqC0ul53@V
z(hsDc`pWwf6c#Q<`Mu>8R=BiFNE|ySahyvJrDKG&wQiVTRul#>b>25vd|@%${&^fb
ztkBvzAcjlO5Q;af;%(z=WZZcreI)$=E$rn{Z%QZ99cg8w#T=bX5bTenHwW@RW#>($
zuK>7b)|Mga9KRS?zAgiZJYz<l9*FD2La6{#du-UuC{i~JKx=FYM`b@TEFes%F;?K&
z8GB$0jcj4L?9lp$B<@S!1&(+I(DyzCmBebhl-`owAJE7u>7Mj`=|cJ`1<`oB1?2L(
z(odwf24hJp>8|u0>7Mk8^gJw*)<$YI(nIMD>7CJ;rr`z`sKkzCj8+UU(GGLLjNQ{%
z$L|4)*Rqoyxciq?@N$E=12GN!rKAmXHQGoOQg|f&RGLUXhYKSD_pk)#(!0`IgQJPk
z#cxPE=^e7E9emUy?CB@c&!ji^<$quLC+SRjfrf^F^o1^Oq<5t^26qnS-(kaN*y96~
zGC;A`z^m3*Zi$x)*lG&rqs6_JWEQ<<()2UYtGuOFGR1T)2@lrPBgTC|QqO;o?sGh2
z$(cyOJWAH$gHaW}Cw*6XEImixZv_@Bobj3T$-sK0JAWkoqx7-#iu8(fLt0x4-;OCK
z?@RCQ%U_1XSHv?pyC@ji>2DwIBiRW8dqm^P5(`xu?^6AR?BPc%0;nl6OOGAR5e?g9
zGYEYL`hP0Dg|EA5<J5wgkw_$OO1~WBK{Z%rE&W(}hO9I8XwDpxpG!XiQSa{jK>EJ4
zmVP4Lz)nx`M;XY<+tM#Red}rfl?8Y+jQNhIpK{R>OVF`lGpNxg2z_EFA;t)cz_W_M
zD9GSz2euG2?<udB*<^`v9<kV*cclyIZD#Y!;@t){N?B|@dhw<7hH)Lk0G38rJ-_ba
zbLkySU(n*kgRE&X9%+OLu4G_gRUZkR17pfs5w0!{Zb^zaFO2FT%cU!0Yfk*(8*4_~
zVZFA%A#+xK%6!xD4bTDyTu48(se`$dm0P9>jEHc@c`5JNESH_}RCA8xJKskE6TFsE
zff-vWjCP<17o+m`BIMjszM!qNc%r$v)Xo9tHnp=dkKgbt8|#QG{W!6fYobp_{hp2}
zMaT8q>m0C<|EaZI?ko=s2z3S3b0~t3q*LkVb_POaacIM)UXNa!cHRfl_sOPqR>$5^
zy&7o*i(hHE=L%U|qR2B8d3s&+v!L^CV#syyntbz2J@m~DqzOjW4hZ&u0$_>?&PO@u
z@kwd^FQkvGV$m3*Z%Z(AF=|3SBAQ=X^-yN{Hr53<BL7S27uM;LvuqhlFx3AJ0?KV-
zQ=qTQelw)__lJWzM2+3g`_#9<rk%@j2(EPXX_ztE61YumZrPk4D$H?Ry8tb1c<1Kk
zTl-tFF~^yhTjU${I=^9SErH{AKI3%gxXdey!66$7{|oTVmCZm4&Q)pj$c5D;yX!a7
zU6=#WkkL~0ngGy6dB7j7t1eE+2Ik=5hn9=a`c2~F^4~y|%}fNo1W%Ixj&V+TfAQj1
z;^L7z$KnO-*xF0BDAITa+4RkD+%WpT8YF*d49^aNSVLjiq02{9=GWGam1?5hESG=>
z0{MtybpUY_Qof*9S5KDzu5YC>TeH@>S+14;MW1P&*~NOY{43jeZGZDLwtq%0uIya2
zIUw6qx(NyyVD@6a88>3kn^>L+-oG#=WM-ZC-OJ}#|9h1C*03>7$qA=qBdh(;WALfK
z&pk@bOd|ECituc2hsM9uPqsggnMx_!Kk=#~JFn*llTN0>-IlUg%(5wgW`?~G5k75s
z=q*})U=*Cp-hLZmjO05~m8e~wQ~p|7PjqDD*M8zA)%9Qa^o#XdT&(`eX1~$J8H|=v
zI-#JGK$$bf)GVc2(&0WCt-&k==$#oIB@DoYO{(qGAV+&zn=z5FsGCre68d{Ui1egA
z5b;3v#P2-m;_SK@@poLjCtaUpb|Kw|!Z9W2s3{9iteSr@fDi?n!Fg&k3t9>i+Q<7u
zj45P?8qB4{l3WlmMd_veAE?4lUc66He(~B*P>)a90wr=fq>K`A^okfp7KUB@TwzCV
z?pV$pM{-X2lhQJvUEEUyP!G`dThaj#*2X}+rti|O{rvV?*rFJIhd~VUDHprt%l0l9
zUfk;g5d51GHgHDxSy5NsNS{a_4HAi#Vu`E)wlpLzHyCZ$iB}UEU6_42Vs=7!g<yD1
z2JZhiR<uw3{7x7DHop>e+zaBJ$~sQd7amA0bc|vEgthqRV+&TN0C79MlN*-TEqku1
z*GB*VfB;EEK~w=?5O>S4N(aW+%(-pxslxwVrT9O6@gf7;F;Bm>n&gHP&7wnj=NNP_
zC}zm=7Jt3!XV!(CS=i}+l=*5R_!R<QG5zks_>lFlBmdu3h5zR+UPi=mVEnt*`kv3F
zkFn<mHg4uh5P_{E(^sLkJ22~(BrG`WJNF(_^gn^0H^_PmO4%DKDN&*OpD6#|?6-ci
zD|-$g3alk9NYS>nrDSoAR<|Jr-#uHa*X9<`UxKhk{@M=ioUvnq1ry*yWm(4BP|@F?
zPm6@793wEZxg;UZD3R?fQMy{jIVnS^XxT3xMgIM8pcrwJNxvmEeJT10qXXjF8qB}m
zPe$$V1|)_WsBDgTkqz^k4BTL0wSv};o@iOiFe9b4q;`wnLd5aF^^n^4R=NlHwtjLA
z;9u`2V7m&gYl>U8(vrAwN?>?lV|uPWk$X4Po(sUwn256>r`mWgVP=tX0Dh4#L~CWq
zOp0P)zy4K!dNqI&jSJey#0+p0rDulWceUgg0x-22W%`vw|9uRY*1Gi5XF{!Ei!%vg
zXhq1HGl63A>%ZP_naJw;80*t7esXkn?e$S0oLeimOdJu7r<Ua2Jh!PG;8|JuFm9Mz
z9=09yG1O+0E^XdqhPi4;<s4i5?WOQPOoZ26Jg7m8|72)o**~0Rg6ZAbl+Sh_f~ySq
ziTzWOgk8WinL}#HoCI+QO@2ubytKMcf%VGm9{+Pi_!}(VlYin#`XB9xI~0B`eIh+8
z9YF0`_W7Jww<>IYP(wC#Y^8&Be^f!Qk*qb4dS><VVC~P1-?A7?BK#jE!v7)mXXN5?
zaB0&1hx9R@7z#8Yqe`6cUIvh(v<rSR{oT&S1m%Dd(?JBE;o8$6kz-qi7H1NyrUU!+
zYd_v^puIX|b6qKFaPdHrUPNo2aPc5O&iWT$3lHXYu)y4A3T9jo8)N75@MSEL<Z;iK
zr*;!awYKe5?We%58718HKq52yYLJpXMGZrK*NL$GR1pR?JGfQz>&t&ozW<cPON__I
zfS*gtnlfC%+Aw7KGB<FWf(p?hxnQ0>rnx1<0e3Z+(k2Wo%o$(u3R43zCWCD|Duv8i
z#~nld%~aSU-6-XL%EhBQul>}EGhROO-I9@?Tj_-3@EStGULF)xt~Ev?LYkWCj|X-d
z_-v4YRU~VLjWI}B$HV@J)yVa(wX<1Y|M$1brxa<5z6kf%@6(M}Y#zU-MpwQE0kzh;
z=W&dX<f#p$%-n<4@N|~eEhB>1Qo3a%w_wP17+iF~H_T0H2(}hBkvGWk+v{vl<nlpu
zZ2HX~kqBPy*Qx>wSz0r!7f~w%qDC&BaB+<Ki0-`|k;%n%cb+hX`Vdk=255O^<0o_a
z^>P<s=tKc|XHEq^HH;j=jjOCRrBl~vuqZhbX2;?k5sE&fPNOG>KhXm|H3Jk3fS}r~
z2R_IbFXSI#c}7UEwwA;3#XIZ+wc$f+jP!c6N?9`nrZJFh2P15b=1$S{d?3ZRzGa`R
z`0AX&W&vBoH?MT#9F;z0g}1Qq@_r5Ga4<=JR0>muL(a?|MXPLdxYpN+&}WiF<4H0V
zm+1es@;~UKkTT47#l_<WEC$Mi<|Nj`gz%`bsXtvAB_gXBpRvSIdaSgCj61gtdbO+1
z>1&7y5EbE4idDD#=X!u2jY?r|Jg>IjiqK#}uC)S7YdaLCGCa=u3C3%}&It~ChB~|P
z>usK%1Bk%kmJJULpPpLn8uHbi2fdU&l0KH62FO|W4*|`Hl1gBPt1gHjpcaNIHGQDJ
z;-7jj32gsUG7;ld>6m*iXlh9&;gR%?^oI0cAjWj@E7A+nA%dFYgA$A85tGM;Z1Rcp
zwDdXYd7HmF1<u(*tAAJe#r~b+6H(jF>fH@P>H>m^F(Eyrn^-ey4yAJi8$GIFht$z}
z0BV%^7lfagHNnUDiI^8OZh$7z9qBWq?IP-(8i_pdWZz(ZSWo`9q}Qa^q?;J6t(nnQ
zx-Y#Wy}2*KLb@TnCVfu2gT;?Mlq|$2(wmIp7?Q9XY(<73gc-=plErUnPy?7FxpV0~
zn1l;^kj!jw&xTZxybf7~^zB2+aOW7~=qS3@0hiKE=|$-|=_wHFC?K8F0rN}gJ#vzv
z{=X`{Y@OK2R!pg6+?C#y-WlCIPf4#!FVjZf;x41}@ILZC8<l@$C6*<k-dI&6%t>j!
zsr!k{jvI+V)KN4`?%{y1bt)VKZZ@cDfyZ6aTU;WGv(Y9Lm%<I{73n!UJ#u0$G4(sX
zExjc@9F_m;(n~~a1(st9!1X|SOL{}Pzkl&lD1V9BX$%~Yw2?lN-jwdgEA$${witjz
zn*<koFj_ETWKYNB+)0n5kEIv6(asEw>|ug&-aUO`4y7f%_W3K3qixxt;~Suq?nvK|
zzAb%;nHG<v38!-<$MQ(}9uZsDPnWjNSEX-DpOvQ4U1=p9)0Uk}XVS;g_oaH!&zsi$
z52b%Ey$+{1Qi)@x?SCTuNNS~j9(3|W?W-csoEwWOLYZjZ>2aZ2>tA`)KT&?dX|Sp5
zKf>3AuY65j_1MDq%EpnbsqNgbGl9qDe@prU=`W?vSR`=F0rkPQdrMM(@?h`cZ%9wE
zV3i@^say5GqeYzv(~y@hNPi)HOM1otP(iQ0@cfp7U?sgH{hie8w;_iQoszYsg-DE^
zDC?G}@MJKRKTyR~!@5P*H6<E5U{2ZyJ$Y&&-wmLK8G+U%|5*@z?yd`fk`%ro{e|>3
z=_xzOuqAvwXXwd)Hr{=w!e8X#JG@s-);Oo(^oP>_7s^^E{~OZRr2k9$BKgt`g}Wg3
zekA>!^dF_4k2V8`8;<6!h<*xV2S)hbArKqTGNG!u#htZ{!Z;&|?Ybx@to(vX%#3eL
zD2dj*$UP3E9>>#@7Jgg$oTc@L77T5r`_lJN;mdvd^Vibn(SmSBb68-h;l0_H|4r=A
zi(DYSF=sf<Bk6}WqN9~(05wU{7SU&Tg)?Sr-9k3KI&4OJ*Rd^E&Z0L~=#ou}?S)Sz
zkF`bg(K{UwOkw{!;s8>}^hN3alD<U*xh0*xDIF0Crx$!z`hdAQR|V!TNPi`LTY5>#
zrF+sD@xpB^>0N0mJz%~>n=U?<z9fBH`g7@N=|17Z5uQ0E6nCVl^e)}-_2BsfE^WwV
zTR@o#UPN^3cKA=A2oG)Yb4JN3`QTF$ntBv`3FjO;Yi8~`61pijA4$TJ4fu-m*V4D8
z=dc*(7`cZ3{YctMFGz1jwIPY{%hJD-{!01`Cos!!>l8RYC9R1gyTv<{J}3Q|^vBXy
z7^+*7M`l2}2cWtS_JN2Hbz#|X+8&$!e9u!;({A&mNe2hgQ_>5#ku_7Cl5NPbL^E#$
zD}cS`blRL5cs8-}|LO@XU@2dFS^Bp0Z>7(YP^`>dryqSJRnptihpxg$n65vSUZ-;~
z!vMvU*jJ=O={@P)K#?+ODZMOxQ~IX#Dw{uYqKbCJl~hW<l72OqLlHn^U^yZG147}4
z(wX!WlR0}<pfbj|V+VyflL7<!$smZ20%VDhk8En}q>ot#iaGow5xy>cOZp4xHL2qG
zrz1P$VM~7cw)8<rAzl0>>Hn1eTDlF}a>*d+fS{j~G6u?A#>I!yXQjWC{!)6H_Ng2o
za}E{Zk(5b4^#Ewt2e5_m?A6>7HJnh{idjHcpBWuW=S(u1U~2@h8$-(G_FGqX*wFpC
zF?_qicSi+dcik-6<ddF}zAk-J`V7`TdChy)lb%wXXMuSV%%HjSiu4!KpGz+gxIH7?
zptp2O9CibM>7S+hX>ZM?SEav{{**bQ8|itX=>_L1r4V;3y&@e;cU_G#0MO`pIW@$2
zXM@?!tf!*q12V(x8ms*Vb0SKVna7cF=LJ0m4W3{EVwp)NWU<-6d&l$7q*tVGOMfIi
z4flO%<Mji^p0VfVVFQ!t`j+$!=>?FFieC|XyCDy`<pU|E(zDVZN`EAMj=!s{J$(io
z_oB3tK9qhukbjGw9uo2edb(p9Ps|vsy$?V(U?Y0}GaC`QB88uUH1xcOtZi&!i1m)5
zN4RHQ!%w)9!qd`Mq;E<ulf>8XrX!eYNQj@34ghtp7Vod6f6F-T9aV~Bqk^Odd{%l%
zI+i|3BA80gNne+~z&)eyKVx;5Y@s8Yk<<G`xy(Dfa_Hqd@`%caY;~w`BvBLF0u5X6
zF&xt1vekZZVMULK0V31Br0_e1Cp^DS{?AK)BK;BgXSyIpV3jry;+FIj`|?WuFH3(Z
z{k8NtG%u%_e8!(lKslb34yAiilXl*V(!Y`Z00@4G4{WTRG4X4jm7b2HbhMuVv^I4!
z0Hh@Xh@7Bx7%gl_j?#wHsPKa|f}O#1Sqwb*C1;u}p-RNwxTGh0E#1c?4c+gd^t$v1
z((|M#!T+dOj24Su_(bX?=+8^vV4i%!J(nb6=>^ZF1;d7q!hxkbzb1V}dR7u~>5PK*
zO*(p_N`5k`LjmJAIDi;X@<^&US8Gjh+yOB`gS#!gXl;A(<gM+nx%9}{9Z|au&@`-H
z&MfT-CoHUSDZ?5-CLKwyNuQT)6H2wToy9Dd31;LrXU|>f&$09c>8niSeMAl71_wBl
zW>1{mbCqemB)u*@C*8L4Wz8$Bv4_Xf9ppEjM3!cQTYh3`O<^^;Ztfy1z-&^Qdx0!3
zd613u^A0imni{|w7jtamJ04)hX9#j)A#SJtC(;+CZ%Qv3Rc?yfWCru4B41|)Qq}o&
z>5rw)V{zxw(~@l2B1F%;2ADR_7o|UyULx^SfAB)W&pwv!vXRTtEnN0XkI4-_kRC&P
zxWllY+_K@T=;H{r3l>%+J}Z9WNID@W>oHhjUd@vIbID(=@!ly*nHe9t_wdKk=cI3f
zAFl{KYP?cmF;HvcUOEKfThi-rArFCVgK(DHh@KArO)LLZ>5s_{q5~olugK$*13YpE
z+(UvJeB8t`v?!Tu4N{Aky!V;FWSf|~SVKgKbUL%io;|+$6j{gqoPhA%<qtO=_zpI}
zXQj{4$P<Wd0fs*%lsF&E6U#}p4lt3?>M@a?;vAH<jgiU+2I!7-3}U-xbDV?07mQMf
zos{nzP^Zd71$RxPyt5);gL&<AxJ7<P+%4u=)l6EdaE#}JQ68C1Nz3_CW#p$zlAs-1
zWn}jrNH0jw^pF2dR~~~wHaye`=s<Vzi_%ND%QNYbbjWixc<=50AeT%!kY18LCp~46
zZg3AynP`02XEKf?ku{QA_D3jBF!JevkH|6<@~^n^mJK>=0G(qr&jbM%gNr@hoMDMF
zEOB_{!j!MK1WErtFMXCwt;O`rZ5m6({d+bb?Z>AHA6s(I$Yyp%K&=_M)PWbr((BS^
zc=;u4u^E8tCF5EXJ^5rb1Bl{CP?v-8m>WB3iACzwU|Mj5Gg9sb2rClW^uJEi%VODm
z4(WvGZA-XQ*h3vkH}?a$gnxcsdR@9f3|P~ipZAA|ghV?Na?B9nsd=^)TlK_x&>u?=
z{Xp}y@~=ss<uHfboM~y<P;R(rHgo{O4+`zQq?C|z>kG6e7(E>l98AIpoYXIQP4G#z
zohsN}Jhu}}qi_+wxL`EM=DIV1GlK17u8jMv%^$Tq=?aQ*S47EehFT6#z62Yr0k8$I
zyY=Uv*Z{AL-^#EG6L@Az17CUuC^p)KbWXmq!Q9S_qjO+Kj}`+T(6EFB<dMLd8(cz~
zh<xhcQAIhbf@PSIs!k}2wO|y}CqO_)DBu)ZaR!S)pimcvC-*X#Xk!lM*~}(Y=Vnru
z#)#_bOEU~WX%>6Scg|6sasEbl-$*u>rs_2tC^t(OVapXZ`B?fDQ0D=v7MYqCBg*{@
z>=5P6h;GkuW=nifZRP%94JLjiTvs^2lw3@(q__8u{k$@ZHx1RTI0vV|jYmS3?>pcW
zJ>8OSTc!SjNV0;Jd`G%PpZQ1zP!nP_Y{ayXw=fM}B9=E}hxOlD*HEIGO)P!O_5=69
z`mc$U7X*+ZCyDzHy{Ahdz2THu2{py#Em2{3<pWf!Tf8Nm_%%>>G=fI%Jp&Z#y+}|;
z=a{$Ls0`;;WNpnWsOJDehAT3RP~eJaFmjpP2Gm@=K%~c&jcu*4e$SAC7ceAy0II-0
zN^6*zn44Nk&q<G^`;xBfC8_Cfs_i(R9a=T9Q|TsFRvDV%VBv(82YMaAbUWC9Ek104
z6xNuVsU4(!!gQnYB3`r!oSFj8*+5$}j|<+hBQgXt?gEXUk-Sg9qAGNAGuR9f_ymK#
zX6tUitit*{xQ>O@+B28L>jh`5?F><yQW@@TpxWGBv>iUS!@FW9&-+>8)n2=YL0F=&
zvYAuigQ`*Gk;O#~<+qu2<g{k-gOe~6M2jr9{luZNjkKd6-#tY^yglJ3&WsE>GhHmL
z;d?0j9a9=Nh<9Ox0-5s9i5|K)p98fuKyWptFQ<Z@TXRYeg=!G^0;hjzO}9IUa7!L-
zH=2Q!(5-7yv1qGlY$mXXR<^N!GsJtr8*YJ0O4FG+uJ2+Hy|!%c1q?+8PeWkM{t({L
z&%#Kf8w#7Jpb=$%rCS$aZMFXbOkoKS+#>erAcLul*T3U+Qp9p9-IZ<t3KovshuAyi
z2;S6=ET)2*Glfl`Q*pWJFn&$QdlPlZfIV)43$^T++P7i&^&M*DrS=-wV@_Ceah>%E
zsACQEw%xM?8P;Ovb?Dd`C03#(6v()2ZsnuW&lD9KC8PE#JKkW&1D{C8tj%t}izdhq
zB}L@G@YjSgiQm83kDz6<m>x)5rddWaYh^Dyksd-VxO)B+ksR0vn>9h~BWWVt#;RUi
z=Y0bOx#aZ{DJJ3GY10Mz7}SJ`U%$gF1|6v&{WxbX@C-6x>E16#LynAhT=2hJqcO^Y
zoRLooyic%CJ8J}TH;@ldyr?7IfrK6QB`1Pz3FR-OkM}1j2E(UeYC+y#>iE$igCah)
zom^F+P8mh2U|(k@_8|-&dv0xdHKAOyvz&V8S`cNy5=;7&l}=_>G-xdS?|s7i?lAog
zAk)P7XBAoA#bBUnZV6U~2o4CE0#Pd=hMx_l5vNM~!t#nO+kV2f&u{>{QT=Hw(qEcJ
zi|xE4yxNei^uox{{Pb1}7P-db=Cl%>qgtK*G^|#nsg>mzt##@SCEr0n0r5o!aA^@l
zWutjA)_EE!*v?MV$PLVk|17ZWmGuF%p}HG_(C7iJ4eM^W`O)Ac0au`j0sSkMLFiDj
zjU~#w98C0xLPwz70bMRg*fKW#a&V}0fMk_5^2}MxBRfZT$B96zeKQ&g5Zzu4k2j+>
z62!FK;ArrcCY24l{E(>s43D=&rB96Q*e!m^6Ru&;&PW;7?5a~N{CEa-E`2Cv(o3XS
z6aIW{M?~+?jxL375o}>atO68ahQLb%G^~Tpq-Wr+w`3qWWWFE*?e<5}`X_a2q^^aX
zr70~gear7|2J+7g$ebXsXc@RgDKVOYO22JuO(0XE@d7R0V3;G7$$a+@KDoszTl6}|
zLxsZkVBnyg#Xei$LxR6l!ZRL5@GbjvYn|i=?BB*tL#~K@M;^XKg#&jNs&;HZbVd>}
zY-4VjeX?M+BWRnU;0=51au9&k#?6{r3_7!;5jQxHE!%7a!fnh>L=7~z?&CQDU^8%F
z5t>&}&rd*m6K{UPGCj7w!fwr50*D|MCG&c~`d3D+Z2jUc1DLVtOa8s4tQZxn@GM(@
zf!2<iIhT&D;b@NOh^oLy7fWR=K}Db8Qk%N3_n$sKnSneh1wAs_#De;45G&ik252!O
zGqh}Nq>x~A<ZR=f``zOGCRn^3_HAY-!=4gEk3^vHl3SvaBe)<t8nY*qO9N)wxdBT5
zc7*F`q3~P~zE{LiL9FZm>=oZgP{{%L`^~;$J|x8&kDy}@=g?t7kQu&ZYVlo97Yg&i
ztyN%4RPdZD*MlOq2z<7N7Ho++XQpP&VC4r7Ul7(*(qmHIl)ju8oHyZIx1Rhb1Htk*
z2hy_?Bs1WUxvNm*t}9ZeskOnLQk*Z*;<M3R6fw{-N#qVVq@sFs#QV3NM1}Q=FyIDT
z?bHCboI3h$;KL$_jt1(|K;l5bqI^Bl%a&OlT3S8qpwAg<=r>&tcA@&ndhGlhrLK)1
za13Rrr+Gnv+hO_}4p-T5F5Z&o8eyVG0GC<CA#JBQ=+Y$`xv&twgI}U>oLTE{W%f5`
zkClG^nkiNnyi1sq8OG<*%0sogp3VWJjdsJ1HBEbX$&+lT%w)KtwQm5Co~2Um<9_eJ
zmMKCP1QER=<T<tNMmi@(uWa_>gyel=_Z!&&xn)SXRefhxy*Nc$oegQsrtJ(Q3LAXR
z_G+2V(+dIu#VU<77C+kXju|U75(EXsEVnxN%owW;M!y(@@0p<xPg(oMDTw{d=G;e3
zxYL0szeNTz!zx4#Au)GkMcfhL-;iFEUSJs4V<<pJl&D%Z+(vpBM3RiSILDGi1#Sf|
zMyS@zGTLqf=a%;8yv~-iVnL2SrBFRALbgOKPXhtTXnZB_xI{s=`<^7X24Y#rI<Z!W
zs20rz)^|hVIKcpJa6M<Vu13~54ngGBsZbjRp0SZ?<G)m%#|4U(VJ;uC^DB-hni@rZ
z?bgfpe`P8(MQ1k_Nkwutl72-ctKrEH_?6ls*e!>N_Q33t7&{}FNnY-pZ{%E`S$)6z
zgzH<Qvn5e!r1C+mtULqqk3$&+e-hGNF_tFhk&94)#B=<BzTt`WM6?F%&*1ni2i4n(
z|GhBSJF_=x(dfxQ{yje~o1anH$&xjCzav%%hEi9ADOjJCY0|`G)v|iS6g;B~DTntK
z<;x5RB4m&a_%Hta@d_mpKjVT0wqb|QsRxe`<crqyp~MCRQzgd-^uoW!28}IfI$l|%
zQo!$x^kZZL<Tk`}Lop-bvw(MVOR#%y?(Ex3JIz=0L*FL?8$uxAWeXb@HZk0M3c7J-
zZ9ctE2>F0KEVWuaWl3aI)()7-cohM~2CmuMp~F$Z%xnf(4+4q>KcvtSuh$TV7gm@V
zuj92eZQew&O8|$BP3915+#_?wTf}~<xqoYsU4tj;g|8W>*#`gY0+2An=+;)dZTD?x
zL>5sX48lplUQ9s7ZLq{Hj?Ii0v6Jpe?@PJ#5;V>V16mlC(et5!;%)2{zns<CfC3ku
zcnpNU0V<4Ui8)650(2>ZtFR$O?JZ<jp$oFH3qqHmwJZn62nfl|z)ZKmE;UhqMiTK@
zdUzcZI{bc0AE!W2;WW;WLL{-Zzx+cmf}9%)Rd3A3l)$+?2zqM#xiyrLq@G)H{{kJI
zy5@F%WD8gElEs`MLmB&WP{g@X!zq}>6f)r*X+z-th&ZgE4Q?_>{ts{f!f2f{R&Zxz
zzIq=yh{veOE{<r|dxRMa#C>XBdT9L%G$F`v8`y8b0o_<fNUwq>f}(Sq(6IwWs9DL#
zX+~^N7^kF(TfVoU>bYR|Upo^Z;@F%U&cH%aYW>L0x$C8nl_g6Vbj^r2Hy9T0?iqm~
z62s*N258CnkU2Im+(1&0uHP>wb`6Vni48t6QAGiwBco;{%nPfv$Rf}S>9LJtxt0PH
zAln2NzoF{p4_G7V%9D^_H^`Fgl*$}3QZF}2cNa(Qc8slFgJLGY?}j9+=?xlXdasbP
zLzc!w4Q*Xn?lOeX3LEh^Mdnc}i)wbwGH(YQ-h#IZ%DHFYg)vo%(K0}w$Sc0OgYLM7
zIhKCz7W3+V;v%S+GI(z6<ybbaElmvzxze8r4!yJ%sx?4p)Ke}Pc)1!?p>O~-X5)lA
zS4<JAtrFJrM47cBWoG0K%;0PZ09S)LdS*{PL4>9CUp7QiUMs%(^c=t`g|DT>zOqwV
z#2(5#l*(|ex0n`mfEXK^%{P_;9u8hT(EEsbHdMe0>gp3qbH-_84W+c`e>t&{b`ksU
zc+Jaj0}*4-a2+DpDJ|hkx%f@Q*gb!4+>G2v%u7Ji6~2GcpO81Q0Up!!Fk!`$VjuFI
zAgE-62(%)#iWZ)TeJ98=5s5w-O!d#rxdz*12CMJFiuf5%Gy*}iIM+)a;g%i9v9g@5
zHegoq2sv2P90F+6ODpn|6YkNoUXCyXk9uYpYP`toy5!xLAdu7&8(drYwdUoAbvRl7
zQxK(RxTJ>kZ$dE9lm7(5N&4;NE-p=NR&2P%bs0=U#UPhX$RkU0S`7vH+!*s&*zkZ<
zmeSKuwWdbEY}qNf4cC-lsCoCXc=Fh-nWa?+`$4!+l8sR!0}szJS`B{wVi3lq#W<&D
z@EAs0j%}}PvP?e6L)>)+5O+kOU}9y6asHLB;?w~1H9B7qPgmHxqk$F3NuNUyL4&O<
zxtvKW>ArN&XDMZ@(9S3`JK~|joMCHGK?k073beCYc*(9j#vyI2Np)%w!_`f_Dj#9N
z78;?9B0Wr@GOF0QNivFv1*t#+<{$C4BO5sEmUD4}2oXv<CHCxb8Fp4+o8bUN9DPY~
zvmzbsyx)w%U5$P($r=i)x@Fh(k%#=ZG$Q3pp-s!NwAyc!5@();2a|8hU*yCN+kP)o
zpsHO2SK8FZEt~!byAvGTHQDXuXa*1&gtOK&u&@gTvPN=r;dw}uOTu-Xu~+9XVs;qt
zuoK<lwN|1zH#1aIyA1HMy6)lQk7DZte`G^`9L2q;)n@Vh4cSp^^Wf7fIC<lM#Vb69
zC=C5CtRqtPp5aw?qpu*8th7Ufb9T`^^lHs0k{*m)Lj`zXH2<lUn@W?;2#lYhbKyp7
zW3B`wKVHLfGnkvZFD;~6TTjFWbhm?`BZds0P}@sp2yA2iu$Mku;u%xEaSU59N*Xz%
zL~eTz`2H4dt@~O=>z_)uSX!^=b#ERI88q<NJPjtSw$obLfddGqQy})RYq??ZJ9Fk&
z8?JkXXy(kY(ow-$VT?<t<c<GKxbLlzh$~kAn2nhX;Ke|JQi1dl)^mdIT@#ic*nrt_
zCmccigl2(DB9JNWGp%!KdOxGklf@%kNca0>vYgc~Sh^h1H2e3jfIW^3!U)(SY@-0_
zMljqtF!q53UNb{zCv2eOK@iojeo<~o3%%g&0^>TAo`T)e86ZL2Z~Ai(GK^Wqrh7;c
zZuqG!YK&klTIR#NHMUqi@4GUpMujJi`xo?SMt5HoZ=OK13$qvv8!g^E#2ybg@+pD3
zCKik=v7vc?ynoMzMQcmcE9V5Ek?FJ^bq_w7VObe*c#ZWF;X#xSM<Ada&e<&_9EN>o
zuOl{JkHuFJ{nVh1YlDu1!yH+5G-7q3)0%$~9{Um@#UpIZ6AnKUBVD#;u7y#V^|QQg
z3&wjE<jysgBe2|^(b<L&?pk_m6If?_wI-Vl8Rf>z7}5U?y?8U*^O?2DEr|y<rWCy`
zkbJ_F+&LKE7vMVn*4^e0QysdQ=5;*AD8|pNY0uwt0HyWVZ|rP<jIC3eNgS!BHMTPk
z45%)R#2*xfjzrxWNB9O1S%k^C)!xEr<WHayXJ$j92(mI>#hl1sNW`efu#b$)xrMrT
z0Xh+FBHi<^2(Vs&ed@m(tk4oEj%?uAu?7CEMfFP5V}==YiFDUz29##DNOMEJY6g}q
zedRN1e>>@UkegWch6rSehg$A$fT(m9_7qb>q0G8QD&}MMfV|rNyR?Gx%noEZvTn)R
zgWw&dmzrxD%PE5Pl%XBt4j>G0;FnwDzXZFhLM|gfWXnA(%=OOfc5O>D8|2;<*jhOH
zDdeqlf}xyxz;qu%l)>Fc7^9%uCL+(%ROnhP<S>FPh+aa&X9%UWs_~`8q1~sT^|&Gu
zIVB{@@cvKYu;!SGLu!pHwnIcUfdW@f4{aIklIjLQC%Z6Mp|BuY?2fJgRa%SroS)eF
zK4($=Pc-@#Yo@N0AmR}l1=89YO{~*`hMiMNci6l&PAbE_of$de>iKg#b3~an!)^|l
zPZ5B?h36inKYAdB75ibs(@)G5W{^a?AaVjulkr<F*c$bSK>N(Cm)8(x7N!LH7ge)f
z`Dq~fa9|5+&vPq_dMMr6_?S$DcF!;&w?yJ=FqbLpq@#X(II;%=T%20?6-D{dc;MOK
z`E%^!xuG>BtFVURf5htdu#C*4-JqqBxU|v(>F3f^dTGCo9v2@mk*2UhM}$HtAv?Kl
zoNog*7C`?NXV~(n>9<ZHy;uGS1%|o?Hk{*$3;tzeztsc3LSX}2tZjB_X5ste>o&0a
z;CPV2!i@m`f{v3p3U~}|ytO?Ol!LNAl1&2xloLJ|Sm^*<qpT<^@r@fG-*dTH+{Z9n
zmssy&pTZPE3I@F}=_}(Nh@y3m#6B?-yS7Sk&0bjYIKu*n%YRMZ{2UWGv%wd^f?E$v
zKx-a%X|$s=Am|0e<QnhY+c^`W^E;O2w>avpjUa2x4UI^ChPl7saSkv$3k&&c!k~_Y
zRj@aNgvAb@vF9^z@tK*KE5U4}2a<?hlGtuT<v4<fH{!iULW!sv)S%L}(N}{hF|zxo
z)JO}~J#;<T>>KG5l%!k0SnGmHRb&t+HsP(owu~>p1TjV&RuhK??%0BJccrL|8HL9N
zwCBj$4QqiK^=be~b)xj(^pFS^YYRok0o<&gS~XTRE3mkkwe}2YtYUdG643)*H~_aA
z;q@pi$ss-~y~5cuk+QS|7nkh*a&#D9SkDRiS)+Y(_>|$*4hC%GC_^;lTRB+dgv-QB
zMW;=de(t~@;;nW#%@$~&#EoyS8$}Rs`MI5^x3N^zi@1{knJ%LCIbw^!AwiyL31)V_
zWMM!T5QYk)?xkNm15sSSHXdn2iqxdSzMO+LTo{>f$+*s57?WGeM?;@d&`7rs!3wLU
z_el5)gFT||b&OTt;#ih8a%lJoEM`Nj6GCc@2$)%xJS+p{AV=pE151k-gp(R<nJ!!@
z5kY0UEyM38wZBDk(D2ik<)~2Ml2o`gzZIc)w}m79Xw7?6mJ>{@uPqzMKQkb%C9+*e
zH`$j_md~x8sJL?{hKxTdth0M+1NbAL&IVa{MKjt7X-mm_*Lcm6t-CNR<x2aeqz@as
z*#m~hPHDi)ZDBOJ0Eq>$#26%_eUp{U&H8N}$q%R|x88?^^_}L{$dM1a@q>aAQP+kA
z%&?m^9&U+BG@}tvg~iYDVFfx6E!DEaX*wN<GH+P?nVk#PGAy`a%0V%>?Ha7GwNBiO
zpT5L~EcO9dSDuU*D>zs2<{5jlBBu(rZAUNJ8Gk409)VeiV?}y2ww7l;XB22*n=3HG
z%(^MYi9k^bzXVGW<*EhabOKb|Is}nEHKw<=Weay77nEGqy8O4uDU4$PX%Ud-RwA!P
zpMP$=>r?5Tw1swdKtd8;y(h&21JqD&otioprhKze?;Wu14s*3YRI<8Hz`^5Xki<W_
zNgN%+F)EAR!V>o&tlWau(un_!)s(|>Z3cN%jzQj9eX%k*73RPfmL-gspvJmE4k$Y0
zmc*`%*xIW>*4#W<u_ILQl)sITB7($j@e<woz*?~mnPa@;C}`muCgUt1Sz_H-5E8va
z4Uc{_h-1SeJ&+dC^Zk*9=|KZAU2<x45f>LWHs-=iOd;KXll*}EZ!-W;<5$vaU7)CI
z<6^|}YzP;l5u?MPgWotY+QTKDWYM3L)P+ysdn2)K?S?b6UICts)B!z$URvQe0+$jn
zH|WZxcM{e(;FV3?6iGx2c(Vg$8iGeV>ArLUVJlucLXiy)?37y$<sa?y7Y2$fEMyD*
z#%h2eC(rO041a0V)iBo^>(1%XP=f*z6cneTH+|_g(2!@>g%P3Z1=&yvs}8KDm+C1O
zF2x4sQi+vn4EB2{Rh(Afk9k!?+LH2CtX3F<9ro-6OvmHVfwHlSa_gQxmhMTii8$%$
z&S(Xx2d^FdB2i)r7EHYP#4x7r&QmkMHFB<rDuMwK42p3I&=NBi>l%S`yn;Vh_bDt>
zo0;-dRzukGbh~{NGysaing;`m4#(Wu&4<;WsH{v0I>S-zFr;f}oEP3T*b=cOGWp7m
zWxrrMxfR_E?%x6r-LV?4SD??NGajKH;D6mStfhz8kODkzf(M@Bz9uBt-5SImoY-j=
z5h)jDgDO~gy*gsU%496Sgp)yTQTtm<AMcM58|c8?4)sZ4$qj697|BaBfSmyfwK!$+
zPD@`|P+~*Fq8M`Zf0dy^O+QVU8@5;W^<?BzAv3SnMAdR7eInhEZX2h!6X6<g{m@R3
ziCw;Cd$ug|cJK--qZVIC4F|H6AP+lh5*{LnQ9hk>0!B#@bC5EYY_;C?^;llBvmVrd
zt+stotNToBg(fDKX+QTXJooTDw6*G7OT1S=S1k<Zo&m(|;^Ga1@^08xiF8~x)i4O7
zSyV7<nmuah{};>zh@OsH(gW!O>A{nmFt%h+2N03UekdA=S~c*&je*vMk;<YJ8hLc#
z5ol}3XTiYbbFAmiT2Ct@G>ovwq6`+-G=^mzvF$b}buW^NEu48|gfR~pPc4}1m0w#n
zT*+Pv{!-vC1vYB@)7U<r8ZHwlPl50ntHX9bS71WdcDBeGSfWLXcPw)UU+jo`(_0*Y
z^8`p11hJM`Uft#?-GJPeK9Qb<?XabaTym?l6VI_H9sY1Z=gy(kA+tUWQFxDD7fIqN
zd@OwgxK=_&obtb=bj;FUj_&@}=KiO5J;4Xm`2Gr1bYaGLxO#zNgo$!m?G8=uMbIVW
zr7)u<Mt6aED2T?lqXtwoKi!j_;&(T2e$r=V^UN;?jcnog9U`rH)B_Al?2Qh9Y@n%c
z(WDtrOKt4O2|nOj3_63rdkd4WL4ga1v0JOs_b}MP0JGR@NmZ6m3<`|V;{gDiVU`=}
z4hw!h?^9TIL_^!b!?%QHXV|BDuQpYgZR*wW6G--e6>ivu!Q#4U*=R&doexT{^nft!
zl7S>sI|xmr`5T<f&=LiaYQgT8^t3hf1b^?qP>fz3Isp72u?Mx!7yR7+v!O-yq4X5W
zv*FF7UKycJAX+2)_Z&J`-Y@rUY&dU?c6F3m_dzw!EN&ZgK&Lg#{o;g~J><!!e7hn2
ztqtJ2wh1G{L~pP#8%#i?KP7qiy#bdox2F|0Ga}}xaahlZWV<DZ84eZlUlMl&=AIK3
z4!v(L1Gq1JEWHSle{4wl8vlM_WQjaf?}7BD^oP=GIL=ERCJNjY8@m?-C#UqL^nIz2
zUgr7;95d4P9Y+P8?=#6f2-{frQ<x2JP3fD_EG}urCDTMJ>%2-&yTPAySyYC_X_=^-
zq>^a$C)6Hm`>W}G2GcN0`4j0S?iR$jOB~w`Muyx28|?PrFB!fQe6Vo3OBA30n#c!t
z{)pjhr*<%A&SgQYE39QH8wjsLFjLz%Ia@c!jO>7!dKh$+Tw`89ODYuYR5=-aua_?4
z4WoS8SQ#V*AQwhIe=ObKCz2!)ejvTc*(HS;^UM<V4G&nj12~i3lirhVfW968q3uwR
zBby4>WdO+me<b}<dLH9`6Cf`qGfn983y?vl>VH}kb4zqQ;ei_rL(y0Bh5Mk&&YugE
zJfNMlZB`iB^FcFqMb;81M&OjWmD?%;-ysDlMEI`s0q3Gsv_h=e?Q4$Zo*CqH^-7o0
zucVKqm+cUn_-ATq;Sfbj#IQG{H*nfhba7^*1GlhRLki<Hye&pMDlX#vCv2zNgY!6I
zk>dI&Jx=-0i9yigM93Bz@6=lFTS!#}yCd3a_8h=UdRO|MbS!-yoTj!po>Rln(>A({
zAo5iDiS*yV@DA};!PgZJ(+tM>rmgq3^c}YG5g6iv@Gb1&T6*B7&p)wCE9nvAwStcp
zRi2EpNY4gDu{r>ore8`Fza9;8c|;J64T)XCxS9}$PEgk%G7KZ&*Z^-zKag%RP;){Y
zaX}~Hlw~|+0(G}|?@2$FUL*fIu)dguRS-9bpzC9}l-`$qB0YzTp0bmVZ4d1%b{<KD
z1DMHhS7GRCTxS@mieKr73{e*z<jher4GL9d)v6BtE<i#{tdh(h&M*x3t+vv8_)F<+
z>7S${={A)6HD}@+FrX>5=blQZ(TSQaena|>bc;j5XGZT7!atWTDba5I;`gK<N-tQC
z#?m<Dr$&q|JbD_uLStmoAeC$km5=bR$3ti-U|blB>cXn2K|;u23rC0Tm3_HogI6jG
z;9BDs1pQ++8UUS2KbHPhnn*98vnvWKCsbu})5WVToMM<?NdE|vduf1Mjyg?Qf>S6a
zZCw0w&P0Bi7%)sw2?TvX9^d29yZjyJ3G!gOC-dAw;;s%I*?G*kNU7Womt315r!M~)
z+7c00lztj|MuU(rao4FqBKo29ckHSqy0!rZYY2|-OTU!fA>zHd_?OcE!ybMOHCov*
z#mTLHB>h5q2fF*!_wGtRl1`)}=_NAkIr+gEa@$ECNbe73IfwP11I}K8d}P#nO5?I*
zs6h_~PrtXIlsPeDxPvzr1cxpNjof3)Ar7_eo(79w+8T@ipf3MX`V;9zBE5}tMn+d5
z)Q33WPK2LG-<O_~=F$rU%TvUbVF}NqyNm;B<KmB`f56>5Cq2s^iWf{f?pM;!rFVVt
z;^S2(5s;L0=PXI^V?=}8hKHAOBsDYKUj1ViW&<95D*r#A(wgCMSKMF*oOqKH8b;**
zvGfD!hIC7+rQ49tN^`morE}?L(l1>9chbkiFl*@x((~xw%p!zS>0{{)>E|?QUS0ev
z={wRLqK9A%Kei#fDQ<d4dV`+rq5PxMq_iNkhVv4LPD5d%`-zNIiK0)EP_QC{zm!`k
zyA2z%#3xkJDac2p4^iGbBAChd<-d{MmHxBTO5c`V=rf@glwK04?!NTCi*O@-ApI?r
z`4`PswwV6(c_e)zedw)7v3SQsziXy`PJmq>(?Rlqba(VXr`!y9(1`-@+OjDsEBf{z
z%mWJxOKajO(fqmPS3MR|H2TWQ9XlG~Hk`^D(CFF`lwl(EHY<B6J%jdSDD#E%2pjx{
zn}Kw}TPWRUjmwdC{1jvUq4ZDEFMI(~1pB`99jTD+ODEC^ErDm!J#g0t(vP2PfTnny
z0Qxh-!<LpT_qYg!kp~Kv<dAO%=$+i>kV4U5^bQOp3)7pM0o#FOj(ieO{g#wV7t$-z
zbCkkYBz4I=o=Lxu-rAS{o6>)jE~QrqS)*Go$S3!u??~^s14#0JhZXva^g8)QnDhip
ze=Pl2`mt|4fBu>hM@|HlLtsk~$jXWoJsmC$?Gc75Xgf24pmQR~UN?TgxD)9HfnR7|
z&`F{rXK@`7D}~~JFKwjHU`A^0wxx;XSJE$}AMdO1+fpOllirg)V^eRpaJ249zm(pU
zeg-JjTfAibzCb^D-7glVF0Ox+zQ>`x!{-kOAvg{dM3NXyR7mHX9NIbHsMJO^xU@QW
zYxLq8>(#A)V^M#ke6ljujvW=P1S2s}a!YRj^?io)HDyyDQN}uxD(Ow>7X$rINr0q#
z(rZxA3aTPOp#50-p7iG6;<fai^#7L{=_6^zNg7WveEI<c*gllr4D7={S-)p+XGNC^
z%qiGOS)cCEd5M-eu3?vQGd?raxu?I8c#H2HvloLK9i4m8U)Alr-Wntyu#$ex`9WI@
zc)UsShMx>dX=wp}0g+~oPs-7xj5zA<br&be{?J-&0*_2)U@d(p{c696Co7iBaFzkx
zCpA8nX2d6RFX9DwR1-2Ttaj7z{!@4@+rc=E@bnd5O>x}0S)!KwvAK>!EP$Y&OAlzt
zU0O|j#n)ES1L+g+=<edj(hsG3P^nAHwu91^N(~<jve>lp??`9TPguW5o@?`7pGbEH
zcNThb0S7U&^$RNhfi;5<>&Q6}P_RH`oa!ha&i46FtszW-8P<582wbB8)1mKAkpF!M
z$TRduq~PaN3?J=_Frj!4rJpiTBTVWJ(IoKb(f;C{N<WewOMh?K(9Vo$(w`5d5BKFS
z%-B5}hqJ?CZj9C+57M*2ImX#}USgjFrE_VElKY)^)-9G}07dS8KtSFSNsN#PWIo_r
zx{09+L)qF$%16=%qx%0-^gldaWb=Z+dsq5+U_v5*y_9||&AHFg<|JMM9^c)!KLJEW
zGw?%G*Z}P_*aWSMFhYbKp=)%8G$aR=4JV!qyoaF4oB;dG3B6il9ADTJr^c@wu0eu?
zKenn-%cf}!h8;m78+!ad=oh*D`e1jo`xl=}Z%ZHacd6gMHs?C>X$^M`N?`0k4?O4l
zFnq9u94o8v*OAMuiJf~MxUr2sXQRYBXB6>79q`(2{sv0dNBv4;WAA*e{A+NEDBSIv
zyD;V>i?@<~C4Dm3-z-5xOt~9<_*kI?haK4s6IOeJkl<XpM<{poLqQvQgfdU8dK7-M
zfG{~Ev6s><>5f_K8q6e!sk?q_)%d4@D)uY3n18i3K59Lx!k1vM0}(d1dB!iUnY8+r
zbsmMU3oRJmJVB(2qSeYqT%OojUNt1qVdz(p<4urg_=AXqW){qKqN$0})<9KJI$UFz
zW*|CIF1}X(6S)6AcgXGiLksJH8S{(N;^nqyV+lQj87~99N3xOrIybWycdqvD91!#z
z?{R^}4p!`<AHDQ`CA2leduxxbTUqHKA2?tKIXCw5#}utLWYCu;RB7J<2(Pgl+a>MZ
zzl<#4_L__R%4=7Dw@;A$^)F6xE9`*eiTRx%n&&i}=B~r7g$!E&mvc!(rm4K}5d0&L
zB1}{tq7@;<p@-0A(*sWktJc@q(0Gvs;`MbReA30c-$nlI^>=>K;s=a*!1S*c;Z^<S
zo5=rHgK^LzM~uBzPzOCDp{Nb!=+dte&eu+QK*_R&Gj?FZ9dmQ$Px1i`_xa7_C&PDj
z{r>rs&9nV2lwVNgZZN;5rEYVCKMi8*WZ>Icx`jTHHqz5TP(hqXV;OQ>!e%hcu^}3X
zW`{Yf-V7ivWI4I62H;ZIJaL7dB#0L7$#1N}e(`U!cyU2=4HZ^^IcB87l(a^%ky}b9
z58;poX*9PDvo$7a7r=#2laXyozL$*LDYH(sAN`i2FY_ty`l+Axx4!t(UCec##&Cii
zN$%3<ZkffAlR-Ci4!E3TJ;QgOk@GIm>R8Du8=gTITNt@6J^ZxK1!%3Ksrl5``CUc$
zzvAKyn)!hdNur>54njF$?rWC<x1iidq+fIXI$T=N8y^m~XHJazSZeY6EjxKZ5||U8
zuCJqV|I5h#caq}oB>yRO%*LouIT%A@DZ;{ro^-%aCLO_H4gTDkGEQk-r5%MXGtN<9
z76GTWmQ&^U^X+v!h<_RR{|o5<Z+v}gGifVgz=XkvQYIAUh7FsSGl_%hsD2DM0ZUBy
zp_l8s8eU^=cxPElPHpWHQeZ_kc=aiNy%+r7{S&#Fkl5Cc=m2#xAe^BBmUa$t%}Ok2
zg;){<B^JpM%X8%bf*2IzP#RXLz&6!zY0l|m_`UkHaFZT!5L#)OQO-2cDF)-fSr4s|
zY?d}au(F{t$%f3S@C*Q!jscKE7)if-0+TUCpj-M#s$cuf{?&YfFf}97odHKUI<zB1
z?}%&}dBFiBsRa>aZqtzj3#u7RV=0W}R$1XQ!pt-2l;ry1pn&#!^QlRV0`1%4iPP4I
z;T|)_ZR8>Ug>(znNrl+A7IBu=<~bcq@@uUXBT$bi53;jjVj|@&oQC^*;s9jetY}G{
z^x4EwpYg1<w|X_jvK@%f0j+n@Xgx6|R!{mX?%eQO0Vfqes@3lpS^jVPX(0%#F!VwV
zXNdFA8p9{9gAvr6(Zmom$O<>o7|xv=U+C)QEva3gcrgvK;NrGF4dnN56<h=%2pOb2
zF^s8n12m`2p~ARC^U}26`IH+|u9JWKd=z^#J9h7q3Ua-ld;flXqDm&Dx&Z>a0he(l
z9gr610|zjJIC4riO9q%<7{D{@PqXdPhxpTIvMZpal(t|iT<;TW0RKuqxv`w4l(};A
z##R4x^V)l1D%wSFV_%#S8nM&Dnqj;AdBZWoTZXo+u{g;qL>#)tD*pb6J$-`Q87)w?
z%?M8Fx@278hD$mJFt?`Z!YaTOoaz!_f4g7A_AZ(Vw#<yIEfNZ6gyp~1Tm7s06lD8e
z{!E`0Mt{mTgA{fpn*|K|(mG))8}L4}GtM6Ji`}Q?hXVqQ#+3_(v)o|lMb}04d-KUa
zTNW0*<Tg7s0vI1!(ZwK*0d7lnL@4Q+RAOp2JoAfdd`e-=&4Nd(`QLleFF6_N>Qi3#
zGG2cV7w;AEW;BS(jQAe^>EElLFbpj-?UgYfGDI3IfDsnN6s+rlHQ4cUrBx=Q)wH(&
z=~g>t?;Y4>>DwEtPxfqhJkkv7e@Zt(DncF^3I7Zl>ZjBIV)lVQo!_s1!cRm8{AjXK
z_ZdaQPi{ek-?=qaYewZCt<O73!6nJjgbDA3+5O%t1lGB<Ux|+2U_-AVxC|*E!Geex
zueG`Ume_Ma&+6pK251oGMu|H6b>ttNpxK_N{LQc3JlWnqGPS?<&aox8&|oK+p(Q50
zz(a2ahuX=S<kAh7TpTgI5MeKRY9UuXmYyQVll5+tfPzbwIeW73!wTemN9Clz`Uz#6
zyl+MPcinPbdvRe7w6%*{Gk)#pm8F%NL_D~lueE_ixN-V}P>I^gHI<DY4lA5qIEcC9
zn$y4%&j4a0m1v|-r2B&|wsQY97PRC>nXbWfPGEOD@^0$E#kZu}XvxA3sujiV`_jYx
zp|&{~<PxT9LDep&rgBLy+VIZKl54;qFRY(f7MZptuucpIbPB}utBIA5bL&Wou9+(q
zP*}f`0^5>P3Fg?d2JOwCod2dBcT>7eM@Z9ekV>qw`-6d=e(`PT8LFw5{CR^uJ?5YF
zz6ci__;q4!(53C03+cXeZ?q96{6<Szu!VPa2n8xd<HLD?E!>iV{v8>Wd}@{~XeM19
zJ;0~dj#3UvYG6J)3eStdER<3CpBQ_wwP^-3Na**ZNBi=RR3>?u99F2v&J(sCO-1eK
zoez2ExgD;vM*km-u6)Zx?K2wgO4zSy^PCv|SPbS6h#LEh7_~9Fas!Im8vQ}OdD2#U
zD4p8~?WpoB48!(eQR6-!XNb!Uwqs^hxr`OPCw;Q7g}EKg79%HQ(8!ux{q)KDe}kZ}
zF?(K<>yKEz(84v<^k|`77zaGnfj`CpbS$i}us*{rV_(fErEkr*bPg~Ijaz!9FAWIX
z@~XMzciqM1Kn8Q^Md=Tu*QGlwZ-lsqlBAENpGp59z2}KwiZou4{*Cli28!NbI6{V!
zejt5M`mXfh{>7h>z9GHH(dY%{F156tO82C9r0)*yEG*-Nq5o^MTU$VZ7Fz6TKm1&f
z<i_-}%%<q(tj%WhSeghKC%Az$$Macs(uz~Z?l}KtU;C2?zb<`QdK%q0<VRO@qP`*h
zK>C$elam8|N&2Sr$I=VZR=RI-a4KlOC%qwkf6z#i_T9_USEbKOFQAwwlx4Tf@%S+-
zKhg`<G7xrx<a7GDFZjZeFZV>)u%6MLH(|#`9`T6g{B~eKTDw~S*lQ~q#B8i=beQ|T
z{J$o>&P<Zp4p%*q*3z5O_oN?4AGsE$&GWqUHR&tT%Tg|V#1N61UWJYHL+Nj&cL(xM
zcm4zEB_?_wuzz-T#LG{m?~V3?HH>w=g(BAIc}x&kTM`vW^3}q$Ca~I$bPO}PfyuiW
ztY2;?q%}54E_x9|f0{x0b6@@o^#2ReOW;yl%e^Y;J|cX_r4YvJYtn1d4I3>Lk<2~m
zL+O3#htda-w688s`u{cQOVX=S$?0U79nzWFWdDA!cp?ABGOgMeC_COf;^}Do>JfS3
zB{x>RYAlk;Z7gWzVw+;W0^o>ly=Y%(q+_PM^g`dE0KX;umGl+qL|QSOHOEM{(wX!u
zPPluAsq~!mE$QD&UnUe=uz)dF=&n>spD=9Wibyk+UXcD=`Wxvp<V-s%u!l@&I+cDX
zok{oisq`_(WN!V|(GGTiCbmQ@y=KTHkS*9k3Ovp&0^1Qat?BUX+;=d1T1?-4jwVY!
z^wIyz-G6@DavfKm=uEfWao52E1PB_?NR&j05-G}*)ZNt|-e27Bjo1CEtE#J1a+K&j
zNPw`{9k*?o_hFr1?3uaqT#z!=)!O3@^5Wc+J2PX&3NvFyEU96*!qIIQXp;YHNZ||8
z8UHjVky}b1NDrk~h(^wxjQXndP3h02-;-W|S`$*dC$-W?(hsE#@^9nfbmu>nJ}=Ga
z$a-EnqxE?yy(XQJ1oVQ?&S)d0`Ry1bQJb9(%2khtzA~rX_1o8LqPD0Y?pY`q-xs0L
zh9Ky^4Fzv`id*roB!w5HuS)-`^aU7uF$yHjDO^jhK@PYWkbz&8eqZ{g^d;#fsgpjI
z#F<7Nogw$7kEMqL`M)ZCL;5r6O#&@RM_ioX#5Aa*=FYtsW=5be0m5970q!X7Hbzd^
zc^tT-|FCB2g={qYme!^YBfU?e0=zrnM0r8}+L^$k%_;0*H!A;^r9YJZXX#69n3mzs
z(Wk$X?nxig&)dno?>D4>FTFwSw6m7r3=#eeuf7fWC(HMB>CdFMq!%rHtI+85+dl%t
z7|Oq-P%_0L#6RZ{(<K$s^Gs{&T_5Rl&PF&Ql6^`H9qhK=C-9@pnDF6@8`AF`%QP8x
z0BO7bk@UYxza_mOO*s=~!cdeA)rc3Rmy9EGGy3fF(jQ8{ExlkU`N|AqE8Ue|m0o6+
zO_OADPx>9{Po>|M9>7$+#-RnX`9#`EZ%W@AlrpmZVY4xZH0PIQ7@d|;RlRv`wVgCF
z2ftY|NLOT=YuYPU4xsMPl^~>x$)FV$ub@bAM2<iFl+tt3?@3>mKFuhIj8|)H>!pc@
zKbC$VU4{tnNMDxzMEbIHpDh|x_>xdQr61puz8k~L(#2ntzA62_^gKCE^ijk<Pnp5%
zTs$PxwuFXjGOZZ`3OYiJXX{Pm@hVSXt+}1#cWO@KYA_DEMB7RVEj61ljM>zhK#D`}
zD+2T{NWUk2L3)|(Is-H>dFBVw8`7twpM?C=;{Ad27t$A)>Q)iKh9Y#*Q+k%?(z&?F
zJ?SgbH>EFfG`ZOHF;cCR?n|ZgeTGI14agBqzcX8#oESjT9qZ5zsqcwwsTb)1?^&@g
zcE(~!Zt%>_NNxujhsWPwGz&t`D<H({L(LJd_q(RTQQVA5M6|G7F)_C%|F25_PWoL&
z55`!R7-gDq+|&1^??_LR2-C%1lm1eA3y(2pB7AP`SV=EGcPM}Hc(H7AqkCpx`x`6o
zjc8#>=qqbl7ltTB1?SR3n9N-)qpaLjf<KnVAF8-*lvMi;AU*LL(jQ4*k)F3yyCBEd
z5c@Vz{iXqfr_AlE(ibTJ?D{p`EpK%uy+II_OHFdWuS;K(-m)PIIrlkcl;@J9>aLAH
z?JYpYLrw_U3X9!$n2PNo5hNPyH#RvYI6sY5hBkv~6dAkEr99_=d4c@YW~9migdwcN
zo6^^%FH3XeH^aiUT$4=l1@Y-#gr6dFet^+!NPf?ZrJX>TdvRz=TD;e!FOz%a9L~D7
z!fkN(mt33=9B^(wTA&`$7BK||-WbT-3*u+i!`-qdQF4zWOWm(0>=CZC-H&iL$CN>K
zeg4JrFn4tQ|2jS}BLF!vOSHpUoMWf=BK)lMXVTZDJLd7{JZJRrZ@^`G6sWX#Um$eq
z*vSnw!2&Gx#1}6<oy`d7xZ4SSZ-vfgxa3|RfUGPg8@n83&Dzcy(P050e6RrDE^LI;
z9gDy(INNnZ{z(d7lYU2fu0P)I2!dFRGdKq0$&yt$kv=bdQ~Es)Ml3jory~Yy*hS77
zySeil(jQ7+kd8^5k}+Sig##U1nEmNJ{{%<31}(2}1~pjP6qs(uBV@jL3Uj@gmBA7e
zFze^ay-_GZkZDLof)!s_rq}ZFd;4><2a^n6=Rn^`A#&qibZBxn;1MUY{zd8cq}S~z
z%c;H32I<YD=LWkcp`UL__oWQ!AMwPUC5%^;FGsTY24wAm@T)XBOpK9g4He$G07G<Z
zqY*|+a80j$N(fFI!l<a(i{~sccWZXS6-iw)N<m2Se^dIB^eJ3#WfYj4#|~GQx%^M0
z&k{aPtpZhINkkk`nODEL_*i;f`mA)1U@suJC98dG^vn?+cfdO`&^Zb-*F^9WD=F-;
zn9eNo2Gw0MfW2ZcdJwsGgJE(Bq`9Qaaf+c3cbFJ}e0kd}-z5JpN-y&jnZeaDUMs2Q
z2?Vhl19V4vOZsi;HDHSsoofXn*(P-4%zYX{y7&vySJ-?F7OSR#v!tvU-fJX-X>D5g
z*6N@wiQ^V-^wJK$ys=(WMn)Xr=wupRQ9vpz39eoKTQ=#&<{i|Cs$(elt60M9Q2t2^
zQ|Ynd9G#lS4lZGOw4M*mrMuD>r7uX&6GQH3Jc%Zt3CfmR$Iy*#zb?H_64Q{Q)SN(b
z2Y)VOF^>4Kl32Hk_?9rHAr;C=j(Q^SAu>Ugii=OME(t-oH)!UKmG3BHW_F63BT|)_
zseTbSEx?P+AKqCTYhh=S%<SCCqk#>1o^f9hAgl?fPdHXKFoMFhZzkQ7P9<Rp3JmZa
zei%JQyTL3NUx3O^BkuTWPi1OMT4mIln~QTsK<5;TO9~Lt7BD9`UkotNnEz7~5Vovx
z&22Y;>FYxUN_+4whak6P-nF^0#!kgtIh`nzj-;2R7p(zs4VhAw90^ZwIViv!*&(DQ
zRFTfwb7wZ$VjMzgY>`Tq?FnyQ0f9wk(4!#+-~WvLy+yBL-(B-XQJ?H$17t>yi!pvX
zV8hIi&~f}NgM4*_oGbg)*ph1!jb0;8j&Tbb<d&msj{s6H5kZ3R?uScB7k^56iM_f*
z7lX^x05d_%z7k4`Qw2uH6z%8k1$3TSNRo4d$I^Qa;bp*EQ^NEXIHKf*1Am(g4m9eF
z#GXDiiBraQyd<i-CtVE=1a~w1B31Wv&pRk-jR9_fb}o^~jRRgtFGvqi-U%X<?X!wJ
zq8dc6$I?qoz>S@|>>o)wB_~{sPErgO))gDGA>ZAx!JJ;*jj|C6i(A)r5a0wulF*&|
zg8(#VtGAe{NS2}xAqw)1wGx;E5Dw~;mcSkHLez0<qi04lY!-WP16AP;NyG-mS_M6<
zw(r_QC{s(RB5GY@Nh1T8&<R(KYJWkbIH6`<+kWgY>}%Bf%#)DH0G}hKx2LB~+jU_s
zzVPr>R=A8hXv^e{3Fc{vN}Z2JpsuJbqm>H|8WuuHGCqgzRL1j9nBr1d1us|}4Sz2P
zPOsdC6c*`bmJnrV&o!nyI11wqxC5#yarZ51xMfU=Y&7xy$|*&}BNTl>?oyH&TuB`P
zWgv1lIKvj>++z1@Is)g!Fs-?|kqUg=YGucd$k4$F=<^i^>g-iGw-pNVQuK3fh{l77
zx$h&%h|6l$Jd*T^XjVK(p!FjJ)CtMPn%Z>`P}66I{@2#5bfbBll^dQ)OURTRJdj8Z
zG6RKf?p*ZGpgiWga&yTGe(i+e215umCpvy&b1VV}2ng+|F}X8;{vcZKc*DxpJkZ(o
zppPWt#<@i~p;wI!HP85YKFVT>F#D8zdCi|=3|u^B?kep2r?GnX1&~9{YGrJvUMoPx
z&DEda99<uAxB76J?EY)IK23eXBgHbd=<{|kt}o0&3Qh0WJha?;5jJ?O4QHzCBe;Z&
zy<n-%HDN(%)!6AXP84c*v#9i?yRBfP6qu9|1D4)oLvbTF8@2?-3<`zo!=3@$*i)>m
zZ+J^XM1wsZF+j3f$$6i`w6jjW8d;QsX$t-o3Aa45#?Y3fo`AwsgUcosa=Kbpxbebr
z$=zWQFJSzbhZp51x6Qd(>wPsU!{P@)yaEex8eUFdl9R0Om*84@529wq>(nHBvMT?r
zWq`t9m(xC}D00^pAO4Z_#7}h$aR0)>h=Nog!G=@3(xr1m!lqwICrGhoKS$fw65)+|
zU-8;gb3;*^>Z~47)2kDuhW!@KtX3o(_|B4r8Gh=*U!kyRte5a!TWh>=L}R4<8&;q*
zhS!?tvtkuQB)PVp*FB4AiR&*cU+Ro$;C6e6!nU^OB7<)*mlv#GVWTuhP^^lcx9I-u
zxZB2(>skNA@|%lex2J1X;8a_3m(rwI4`Rp`dm6w?%|cGBB)Byk;O5iwBeGmP0)!TS
zSz(^mhX<l$EM4sWEe!T3=;n4z){sTBA}?5wcGfn-{L;dreBd};fg7H4VlnN8s&7E*
z9fh7fm3D_u%2C4<_%}G)CC6J%5#t_$h-Ztyxn%do@1@$cK*XxU2q+BDHEup~-mO6>
zca0s?LqTfeThHw5loSUpthjVU9K1K(3mcmpG|Y(&Zd^%Exj}gwjWt7)r)I{kq^H2B
z;xX&N^`-f$9Tci5RzCgq32$2%-{j``kIAtn1e-w<h_Z4EkhHvQ7~h0OplD@i2t&dF
zM|)@MmN;P8+GJ50xx2IgWGKAW!q*va{nq&UJJhD-{v%0nYev7dGFEiaMK!W>3_SMF
z&Q9vdfPIfq#Fd4C5eDt+*y+!u^ebscuk{oMxC5hDOOJ0GhFHNR3f^lx=Ms%Ll3oN+
z8{#Js#uUW6p)E2#r!(gFk_)YjSV<rSB|(p@!+Ioui$~~)TZE&UK-!MLs2+7d0o7*Q
zDzki7Ji-bQb|L(TZ00s^HKzvD8mKx<Jj%g4Bqv@Fkd^#!Yx!T$IeKeQfaR3NJ``Ed
z6s_nE1J{6|XGp9C<rdYnGoY!fQ5spK@Eu;y7dG;zKm~j7U}tQR32Ja^)1kN4{L->F
zwz23_(&M1H2m3L@Cr_+RWBilR{dr<VmmDzg7?T=U#I+wv9svA~hjOh{|A4i%yN&>W
zOR5|Zn$L~Y6LEBpgg<!x@XrO|?~ZHE$P6<=p`IBG7E44u9jev<@|I+V5zy=%@R9V2
z^a8nj#R?}*z_p$6Ho0vWSrm=tL=Uz7Fx+i=ma~BkZcQb(;J#6c3)nJ3g~lVejHq=D
zb0wJi1u=cg9Lc+*qbxf1%r#k0;J#6xuZ=|Up>*K^P!jAz3a2wxdrLNxp7t8i9(gHu
zhnG5nKs~pvgDpTxM;O(Mo?;2-R03z_TpFIa0?3)7&H<<FA*>SJ2so}aXgBfzjTUiR
zqX&lfj)4|YLCyRygn|C<P|VWoT4(p)QoUFWP`r%)jkvF|Hs;XIs6aOZ3SLE&8U%xt
zl}l3*f5XpveMn`zfcJvb7~=`mph)2#@B)>;eOijhZ93e*xwIjgn-kD)1{hpv2U&MS
z-4W>r9j5>TztsUG<yi69YvxGI&Fkej>ZzN72o_3XwZ)=s?5N4zVff?(+qqGVA|URF
zhFdG*jX)5ES<i;3=dMjOzCa?E9t5c0fO#yn{AzG`OOTSs(nkXkh7X$|s<7ul`^oX7
zxdpmI0~&5L$DT(_{TvMRD8uwp*oLI6vr!NmL#*cnksFJ(_YOQqR8#X$5%12;VogWW
z7vbF32r|OYDCty|r3JxfuROAQ2H+T~wzEyKIqZe%C`&DP_;7$h{fnro7=?fW_ni{>
zMf4WTn-+VV^~1`&hIZ6`TQb-{ajzIv)_5e-vGSRX{3vL?3+!NyVSD`S6a9jJRGX35
z+QaA8BC>yk4d7EyhlMUi=`K=<?2yBmV?PQWIJi*GBWZvSwxbRt%<_b%4RGWLi_}@O
zX%7MlsJ4Jt8SJXsyxRIO$SbK>C3ksD=`=FvlKW?eXJYq4xt-0cjGfdHdtO1p4Zyf3
z!_o##mjonFtp<>G;<fZ4aKb)<6UcC8ytN$Lw6OUl?da|emR}UYqt+1e*cef|=Y!%Y
zD(imitXDWEEXql+@<Z>D+fWXf4Jv8jsIlSWDR73nxn`nC0iUcwoDKL<JMcjnnzf_u
zu{B;th4x-ZA2Um-@vXl^vl?QYj<*#%F5;WrVFahX>)KfV&VtsusK-1(1g$%;tqm^d
zaHA{k7SyZ&gNJ!!VN?l6o2J%4+z^D0AkkrBf;>}NC|3}n%%Fz!P^b!NCz+j?#Z7Ox
z=7J4xF?@s>N7*)3+1{B0jvCNrG=dGB@yx=I8Xr{Qn9lH|_hT1#)@QjOT#NKshU8Zi
zYWL3{6`P7JG_h-T6dl6)<hOaRggu;drcG3OCx~__J;gM6XO3jycRX11@HfP&*Ay1B
zK?c>CyQz#i9*phMkd}J1h7=ApDhUnIXT>i>47ruwAC!Mx|B_+!K7x)N8PUhkcoiJE
zy(S?(=H>x}pJO?Sf%j1CL0QfiTRcL+6GKupc!AR9{iRQGfA{dIkD-BWuqt`%o*d-k
z6yG1HqO2N|9%s>?l)C>kAg1w|QZkICD{MeVuamB{%v<P9B0HY~j~BKpdzFY#)Zocv
zwpI%kBRHtzECRQo6+JhR7I-up$?`CijgrySV*19SjwopMbf5;PzlUs6)$bJKp&@)J
z49gm+0nC|wx`iqdZ0w8)9vj=6`xHJ3?7@QB8jIx`&D)yIDgELV{6NR{oV*j%+ck0>
zes)j<XL#Z%_)DmeFJ7;X7&uGR{vtA78y47%4w4Gw(5s6z6uRIfL>SSz^7$i_mLSy3
zi@0OylbG{&F|dJ++4^{~@T3i07H6i~O<>Rk*utebfP!*U#{I)=%nk#{&H#&ZNjQxa
zys}h}`WP#+9rV&oxz&;+?hXu&47;~Q`G!o^uxcx8%ZB%lv@l}3;2;k_fy8e(hv&+;
z)f;I}0dFTg@)$1Qmlc6>3z2L?Bqvapp!xK+y$G}-IL)jo+A%{j;~snO(aQizW@35@
zmGko*;%%&~bn__;XXq?d4!T*Gn+d&rh2QqLKSA*tp&t{6tr<(9(Q&9DlgK8vh9zz3
z9xPD6JA-ca^pMF!O)d9`%4CPC>_FRlB=$97s<<Ws)Zis0p&uQd$Q{eLv;E}(&f4te
z90fiK6fU#A(}|JpLV=@ssDX?*@eL5qnJNoA;*FMszb3Mt64Q?mnSuo|Gv01#snC*X
z0&|Go!wpnf;rfx(&=&C6raWdA$L+(v#1yzTTP?hDW*ro{u>wah^bSk1h720HXQbWX
z$(I8jL&ElQ!)(M}5Zg(Z&R!CgSsQtHk{!h6)&`8v2I%`h{*m3L`#-Yb05cXcg45L?
z2C9jXXM9mOy$k6hR$im&UxT^3V_osBVe^4;EJo!oVt`P1C*I2(a}Ibi3_z#mu_J=d
z`WsQUQ>sXRIt$!e*0vy+xgrnG5L-SP0JRAAX}=+51Lk{|GMLO#D)eWC3Ca2}#g5&c
zktMGOC{|z(A%&>*%`Bv^fPk-j3Cn>DXXKA-%b&xl26)kW5F8lXit<;98A$h9*otRH
zP`UZ<+|tX4-8NXM;OtybxXKPApspXpw2<YGtSMyP=X6h9h7HHv%}Krriv})@J(-N|
z9tP~_eUAo%k|mk5uxn1^T(P}}7-(m?U6dGdQ2h;3oI}hElfF;kWsbi!yMIkdqvN3#
zhjRepVvkWH&yEPDGYDd6y(8fayD<=Qkif&Y?Hy2{2*H*qOcyh2>MEqCsNfz#oX~U{
zgq^Jo4#;hML}#x&e1!rRJ0po%+lYgPrQJZy8i9dp+o+v+!fQ+CheWne-VIM4+03!U
zr!{8qRurXX&1Pg^kEL@dlOABVTDw!{5Z0OWfUu|~7zst15<u++$`?y;L}cf|O?sC~
zx@%PYTjgKFeOR&DX+<0M<tgET=fsLFkZjWld%m$|nPm6Zqb$PMKUM!f<N0U!%?nt)
z)5EW*2|72%St{U*)P3SZJ>c_3-LbMU0$1Q?Q)_OzW};B(`zQTvW<<Wm3LBYmp>|eJ
zl15oTD`@#GPZfN!86kSl2808sjV&TvW}q+=GJg&5lL*j)`z#8|E%nYYCPVzm4N$=Y
zw+y5#OjZq+Oa^1_7a-ZdQ=6O<Ki`pl+%>SMr#%}AOr>=Sr1gA=CoQ@2I1fD%w&)hx
zu^)>5VmijK-vAxIG$R-8P|O)cyj*%snoHkv5tcSaEVuIzntmH&N~Io3ACFGsh%_f^
zU!4i4G32SVmYaPkYHkBH3Wj#2^5yfKDkucp4z^CR566aHCq!aypwN_cX>W62L2gVk
zKPP4BhzajlH^M#)$$;%<%ue0*Tf4%ORamHFk3hGa6m&)Z<TdGQRHZI(DVxy<TnKzc
z_&kI8@s#F-3I4DeoN6jC<a>ZTwUKJJOn<9vxa1y)Xsv1Wj`gA?UAmM$K=Wrd*Lma>
zQY7^Rai(87CiV>-+Y*~@eH>JW#k}BAqks`bqXm1Ubtz^P&X25jWMOdq9B10$iAFk^
z<MEETf5mDE?{{Wg|LZ{|Dw2xMSVXmDkPSBXh@(PsH@;VWu6xbSnd1*CvrD~Ru$}`5
zlsGdI`eW&oM(Jp*{Af_iZ;cxgRD+z^N*!00c3Rn9hLJhVVL>d&J=SmtPw3<ScoaVo
z8!av45-l$QrYF2qV=U|aC%p55TE_-FE?D1NX=!EE!VOTtz74u%Qrao>{{?biKI25N
z7Azz9S4VwDLBxL15p?qi6WVC!MC!G5gw5GE3jmJH4QRnpJa^bJ*K{kDnCl2DW~18Q
z(MoYefD}wz5p_jo<Qd;y1!74WO8FhA`~!0Uk>BhMNM?`SvUS#&q0EM&q$gjFhVPkK
z$&lX`688%8)}i^mN>0Xmo}kVFfwaVN6KmwtAOH$TW<iyuM&J|7TTS$kA8vt2qH=2d
zp~Wq!Ol^JgBc`jc=)5qGAC{!XC8{4K-)MDYYHKjp9TTZC!s5;_s-FBiV_(05m0I(|
z(7zp>@Eb2<WtQ4H4ZT3Fk-i7LwWk7w;YXo1L5C}Gtidc8XEC)ltYLyiUQh`+v%_uH
zHmR#8f(-*ok1^&QS<B22&0!;KYkOm7=B>aJG~9V-Re`<bySe+84Ya_XB<=eMb&op8
z+{&F@XkTtaky9LUO8Ilg+pqZh-bi-fZXZe=1L!Ax^!<cML37addUOMq1UPf*1}&)I
zijb#fjQ(a|LxM~t_Sy>3&(Pu0x~8fK|AMpgSgP@rQAdyNl<UI=zQa$0eN4|brJ%I4
zIH>D)t?b`?Vqeh`3Kk^nCv1<>gW!&Tt_;FyI9@S;l!!)7OqqMyy9S}FY>LX%d}T0d
zg3GnHfdP92?6g7c!k;8vc*@Bz`^9Ssq90=M1FlYjtgMJ18+arFm2qxlCt6LZAysy9
zIf$Tx=)AR+KPEgV44Zvy{!Fod=5~HvL96ACoTg)S%R>x2o<Ao#eQI%RPG(eCHU9D7
zP^%V?cV%6a8#FImceEo;{1FNh?+UXKT(BcT)R}2ScKG}WZ+KpMozPP(*p<z14+q>M
zf|VTSbq@^-7(Li+b8CgUb`=VqO_bguO^9qZZ0I>y$B6!H!MqzV%QLgs8{&!1Y}9@W
z?C1xd0VT|Mh463{rKXEP5I@86MLn{@*F>zhCh_SN084*?h}WWyduoPqje2hnXA%|`
zBeq5t4Q6|X=t@hrZ=@eKY~FX@l?98LxuIb*+rW}dtPyL2#c#>q*MlBl*S=F!q$F5K
z0r@e%Q&@1mfBu3PB^+op@~pVNBCnqudT?0;cg8sFvfmaE^vaS@rt&N>YOG;gmv$Ca
zgP%(7EoN18zWZx7Kua}m$(^GSMpo7x3cn^^U)a2gC@kdm3fsXvzbJxjScpb?BDKu4
z6c2JOJ+U!O@+F8Sp%&h5P-l<%r`)2!5fm~0q~$%vhP9@AYY^U3j{yoh3giOGpJ22~
z+gD|u;=Zq*IK3($?iQE0ppCS+&H5)VF=KL7MP?VqB$ydjHqvab0z1|w=WUl(m5MxM
zi-+B7f$!%Moc5a7b-@{qXSmeaVJCConNhBa4uq(9wM^)%=@Z_68q2E!r}D_SVUIP2
zBJDrHgkBh9IhtSByz?9%Is(C!XzrG;o3Qh9#4dOYCC{~2zMM^ZMy;g+IBB>z3jJ&E
z1da?Q3GdWE^?-6})_kOi;N}4)9LTx_`U>!*hNWBDxe>KLeMu8{0F8}Bs*MeYnDl2Z
zNpuT6FFnshk*&27)ZD)@=QfT)v{=8&FwE#*5$WuS(d%xmY>X}$rZ)`bjDLzPS$hQC
z!p}U#fL`=lz=C=ham)ydqwJp`k+Y12h2k5WQ8@@7Qjh%;={d046{3#48peDx@ZMpA
z*F@73iY}3P%c$t<LkAl8Non>hBWMtNV~O3V2R^**w?}nc6Zt|(R5Bg3(1_oj(P~y(
z!h6+!&xYx~^U(;jHhjP1o#VSY)97I%VQYOTQ*%j~-8u5GI|Bn)T4!sGY$DKUftEU2
zeXmAqNP(;`>}1{`(q;T-vZ^Pz#QplU1OVZKW;}TWc56c1ky2_npw`+vDrnG`wrBED
z@0&we*pTeh#*Run_7j%5bWQ2Z?ykwhPY5ZGBv}*7#$I7Bx!jn^wZYa?_R@kCJv#JW
z!hYVNbhTk2E9=LevNf8~7&gjy(XWz|GCh?Zf(Sc1vBzQx6utz!>&#|Oh-Z4(jognr
z1Tw`?<*>om7?u=-eH_Pmxd2YAWO8jID=TurIpq8vgi=xAKQ<4Z9{mwF3B&j-6eT3l
z@lfLW!ivj*lWtx6a=csz7?|4X&+tmeeR#j8`a82}8@iu!LeBV$2I#%ZFT?tFMyuZ7
zk~2`r8fVl)B7IFG_gEMgC<mckQlhNffusfd2<v%7!d?=JO|1yHIW#~SAW6<rt)+)#
z!G&Rr6WeM%7Rkf_tAcrKg{Art#|^FjAcLuJ!!g03B^s@7V_}K4S(7yc`Z~2Pu$J91
zY$S@1YYQDIynJRRCGf&Pm~R4=$O{^nn1xBCMtMBN7$FgC;fc(M25a*0Ehgm{I$1A{
z^;eh~!WSWD1UXwPwcI>^fmvG`=GB5ZENK7f%*c)SfE?Gc#%XLV@DFEx#?rKdz39cE
zmN$>0$0O+>Vq05ZXA?IUKz}MU6pY3XenJrZMxr-iWG4K*GQLWQBIVp*?g40Ks!~}G
ze8Da(fNd&(ioh!NCS_@p_=2+)@x+X6cVUWrV_5eqnHwHi;XCF?Xh8&%Qz$9J2}O*a
zUV6)REr~RORa&xiy(;zv+ik+x6CD?1+%TvmBl(xBL1PW!jRng!+FDo^zOMsCyXX?n
zkb`*d7-bp(aj%XQTW>|kJ;mb$q#tF3Y%t7z^pm3oCAXa!xqpX5dobW}f0V$I|6LQ%
zSBR!MG+;Gd%3B-R5RLP}8!dRXeDty7D~>Rx$L6#e7C3r7H(o<Jw$q%wbdn?BXcZnp
zSm{&SLGYFJMC?dhBC4r9bn%@KRILo~@2Fq#+VyebA(yR1LY{%fts;*KOR|@Lyiz#o
z4K^%-y$Hp}^`~W)as?P(8IY6_o0aVTZg8<oZBu-51>rAf)Th!^djC*c<C&8IyoM%n
zL@(uig09_Y3^mxlok7}7pUS@j#q9<nY)QYKGe^I(5V^s^&zyxN5S9x_5EYb-hKP4<
znNd_ohM$hXp-1?anjfb71oYkxK#Y!{Cz;wy!#l3fmSdajFh4|*BhrubGrA^n!(xvQ
z@z0(Eh^;f{!8VprB)t!^$UV=6q7L_%O*gTs*A5P1R3_H3F<PcWJeEEMk!NQ|Xw5Br
zi70s(LB|3tnYt&mvarL{vi_W`o&alP<qVMtA!~$~8ym#aTfhmEtfnTE!otUj<a_Tr
z4?j^m%Z%3$b%)ruLmiC83mWLwC?HPpm!S(c-`C(MPiek7p_L*S37zE;BXLlIClc#?
z3DEu`O24&&Zi}nDc1gFS+QFmmY<6s9?uk{m8zfOdA!+b?Gq_M4FSw+#t=k+}WA);k
zlqFs}g#>f9?xmrIH}V&eOv73vv@U#Aq{300-FuG`n>6j8q%mXt9se9D0;i{aDm|ig
zePvKVXAZ>Uyux`Hg36y;*7O0>R37xlBkmLJj-9`;IC^dPM-;?6JpQ=+Tk^=vP_W#}
zASW!x&KQn2AVSm%r-VqCq;f4=@Gfc=&ZG+?mlA|m@O6=jZ=f86KOd(6bwoc;EWzt2
z*3PlMg|%4q++fteR@hwE%V@98`*`*K=E8x@Ex6uVhrN(|)L_Pnz+njrV)|<=xy+Te
z))BQgKpihoS-iNj)O=wa%ANsiX!VM~U2K$z)yF27kr4<Yl?y(W-XqCA!XZ!X1ltIu
zDnA*dWsvR1aBPD-5{1t>%z<d8yU74j+WNlqRC?Vw>cQ2E7L1lC^VEmghwxhq%}P!W
z%q<BBoVLT@@3+yG$IH=#2X+Wdp!ea!G+^N*pLm~}Pbt@|QEWl5S(#z%$$u$5Vp!`G
z26t@{ctjFwW6@-B@%z%RIGC>?;+U}e8fL*%Xvbc1sp<6zg9;neE@N-CBs5FfV@Hap
zkvQ%!_cO@($JU=PLHT+aScB2t5dNL8@@sB5LE(D*mCj6GZa0}(FcC%QWTB4_1BfUC
zT_M#KMc^>~0YvR=wqs9(S0G1QE9kepQ3~vyNKgDUoK$6d$c)z-6Yv7ibEK>bs35D`
zW?==xDJoHw`K^p)Dy4gaN=nqBYu3EM-9{1n4pGhIP=2K~kz^2YayT8qEp7%~RIdNm
z(g#GJGhnx1HY|9l4Q%OsD^<GqUFkjPIlA9#fY^?pb3=t;>3s}o@g7PS7>3N^7B_fd
z)Oyvcy#r_%h48YG?$oD&b=-aJlE@6CxC4X`C{Rc8GBaGH7dusUPEd!Ga@!J>ZKfa?
z4J%MjrSD0vNoPjIjf}RlnS!b3ch96sYVkwq+tPE=8<Onkza38z#JEeR@uiD@ApN~`
zCcVmAOo&vcIEiR284n`D?qtSg3Z)Tsyp3^DdPwY!)MRTsi|D+m**cyR>_4%AwxriV
zEZcG(!xU&_pT4;Px{`h&{X}}zRw(+$1J_Al<Wuh%O|`#YN<WjnU^87eHiaqWSK&SO
z6~I(-|FQHP<Uch=Rth<fap33D6LRd~CkL%s8{;(!`o|UodY$R!`D<Y5f;4YVND~b$
zogEb0^B&<%8?0x=%^S<TOUg6D3U1ht_oVlw&q>d*%Ogr?*hs<#Ej@<0lzt+8PkIT{
zEjwVYG!(4n<imXie<i&w{e)q5EnBcfqDusvoYdn12Rlqi0hpb1_JsW#z<w0Njxa}i
z`ETi8m6?qswiUIFBk19$gCO{h^qiDQkNPudHa6P2FltwJ=zuS!pGiNG-jH4}-2R$`
zc*mbD-3+_1;aAd+q`#G3mOhX5c3#;Bq(uvhsucSFL+M-61L*}cY(}84GZISl>g@G@
zzXx~HIj&@Zvsj>8!f^(lt``IrpoGU9+qJZIh(JXnQ@R)&C^G?e+EI|qppu7+YOs6L
zLkEy-$j_v|k)G#p*U}CWzb2+h48xyEKZ726<3xWZ{U;9pnn;hOiS)erfYz9tau2+g
z-j@DPX(D|?I<q3MF!w>oeatxKcJTZ%gzB7{dZc2>7(C>7zS`%Z$vDa%OOozOXVkGX
z8!11rX1$&P3&eDT{3`&HYg@uK3p8|KNeaJ|z9qdOJ;&}3{2|q}@4+kEc_}k4{u}8f
zDt!~$3hofKq&5GT18uh+0lh8#71gnqvBw)~_a%684o<!ryuyG3DmGx0VoMqw3LC4j
zM{!!htox98#X8@??gy^aivcpTuWQ>5(Ug?4=Njvd8leEKq+dvXODoND=&Un@OTf1^
z6~h}5E~TGH|EF{;{WfLdxiwU*@B|rVeCeC#$I|~MEv2^*cmUO{5o<4`kELI6po<Qd
zASj3t@7G>2>=OigOy=Y|9!s5c2Oupe-xY2&L!3hzcLgf9GvgZgS$MKAM8g_P3e(?}
z{#JTPdLX?>>v#aZ5#gmg>XFNTBfTg6o%E`7PkLadhom0aqzKpSob$ov`GxdX(tYX6
z$UiqUryw)?KzdvHmD``a4G9*))}oFekp}vEWSHANkq8PwYene*FDf&dS>LVim%g>N
zo=BH4a!zc8FK!#AFD(0aIDh`jY{TSwo=}r&r4OVZNWX;Ub93=y>DxqlFQWiELY}lA
z(nI}{X$YO)`Mbo(uUSt+RKTZ@;r>qgVYnXG{(>AKm!1%l9}&m2NH;TwxmV-B_Z{S!
z8a=MG-onVtdh##HIj>0YN^DaE^qt*#ME>W}ccll?k@RVU-AYo}9hA1emwx0@NFw|@
zsghRGZ%LW-lx=(p8~R*&U;4iE_F(an{L@4J4sC;LP7}G53TY+1D}B#h|Dg?vGU?8S
zgXXYv9@&XK&EU!w7h9M_x3u=8@Fqas&aaFPhLW3a`C)5RmEhoxWME?Q&p(nlz9l_R
z0qxS}$2^rDNxzhSMV;;DyB|q^gNuDxdYK?T>E9(-+=tT7rC+)TYnC+W=>zFpI!AeH
z^lv9Ul-`#<I1GS9)Vshw2M{FwvoSVE)ERC*1^>38(is(p4ZNa`vrSh6?-BlGj@Y-1
zFj|mSL<`{6Z8Y2zssBK_E1gJRu(^HL)SxyX25(F6y8K%L*#Ap9mOf|0k<X=H(M+CV
zfFEPL_KWva`YsmgO_I1P8@#)co=ERVKazgo4tT$lE9su}l8v1eIpYLBQt|U<uz)Qo
z*w!Yg)uh82`*CZfu>JM9X?ffyBgWXE3>%hieH)8rCw(aWH|eqT9m>=bj(K0gL473s
zSbBf36Vt_iEiI)tqz6QOiT65_&P<Fy1}W+7zHdt#>2>J={FjdMMpA|kq;E?<bn7+P
z106=Uv8K9$`0|8FBUghOKt_YZG5N~Q6nkg$HF9HD?B!qeJM1>VJ_{JcwavI0sbQqg
zR(fChFVa?elc;G1rE|;H{XqJ@^iyv}O&j%n>Aw-sA7Onmo;;<f?@B+Aemao<PWo8-
z7ID~%zzL~(F$JFfmGph-XM>AFghDGbeEkC^IJOMzuWY6E1|)hsqJWl@$JG34bWoHw
z_2}lyYVHsP<_W%b#aeWx_9M!Z<e&6EfdVgpAv<B_Db<#rxD+A?{cGu|^osNf!vG>D
z>L``{OnQ6JNt}%Px1|+F65O++#dg??C(=)(UyRaNMd5sI%~QeUn-WA{0R`+)z@WVZ
z)SMppl-ih3<RusP7zjbo$S`#QzE!p<E7oYd6Fa2vH_}G>4p4;H?a@v0Au4=5Fd<K*
ze~>mL(Mj{Jn0}a3y?JM_c<Ig`NPi=pOTUy}wh=(lqxw+#Kzc`d+b=Fd21nA1(u>lW
z5!f33HMhWj#xLw2ptMszGOYilByjJbX*juizk8h89dKuaVzsry#C{z4bTAh#r5{=M
zV}!z)F-k55)tq#3N>9Is0pEbZ7ktZ_AYgd$hS26GP#VReM;Rs2q^?KnNDaYCs9S|e
zDy^k=!n;Jp=SKcBh<d{BMBpgo7sB592I%mwm&W{xI(A{?opYPQGO__89lezPo|GXJ
zrXe}GvWEJh2;Y;I(hrR%6Ge;GV)_yJZ>5i<mGpBkm)b@sB?EZHX*479k32MrRx6xw
zf!$u1Gw9V3Gox2biD#xv#ObVqrLZr};>oXJN=)!Rt+|vf{~FtIoI*Cx!bQKCyt9Jh
zB{i_&;=RY_2`YrFLbQ@DC^e19|9$D)=ITTw7y0JeX8I04rFDqr$XtCCsO!^-bl6H#
z@hrKGh7K}q^f!z&(Ww8n3jp8J#>+Lp+>_h%e}f3W%L$o5YKz-k-lm0H7Vn3aKGdvJ
zhb>%=8m|;UU7`Q#Z{Zz4;MHK~?K6PfxYxlmX~?-M%gcg9-`fDmXgsm1TWh<gwR9x2
zH2LOY=L`k$I=510#2Xc0+ISGu0kK}NQL=t;r+&-G@ZyxFJQ?dc$P6vNl?nDlIya9Q
zkz$52H@96JE~>C21p-?RoH#RMzJIzFzcHhx*FYc)`h3G?8e*6&4q<I`MLLAPG&dF-
zv%XFKEiwMb_QQDE*lo=yijhqG$I>ISa-Hc}%f;>J#T(Lu+*ZpsYIob6WqMM=h&r-h
zF9*fg8D?C>x7Oy9h25z7UOL0|)&mwtZAJZPs#;mebILd7x0#+pExfR(KeNBM2%n|@
z11}S6(cHFqS|dCS{=~0FE5rCF()<Z%OIVB6#v_Wbz9)zb6It41%mz<fS+nQbA-v8i
zE5SRRAoJ)(*iuQ@KEnaK7CyBc-9^<s+X@U!h-}-)+(`3`JD>Miwpstm;o_D41+~SY
zQ2>aoKjrExP{sW_G=%)Y;0~`Vi+XbGN$&+3qRTrxPjK=pJm@uZs<wmEG)BeP*|TN#
z^H2T^Ohk6@ufOiazuA?U)eW|E)HGQ24BzNBbmvk`ciRvetfVvR!rE9hE9fAFUmP~A
zGBRI+5g$<s%%~DyJ;MPDlUD1W^yzG<%jm!UX7c~F);xWpj<_@X7*V^3CM&ZOy>HLV
z2Gs~(X8mQ%t-b3Gn;Jyl%4UiLrMNctvVDdFb}3}H|H7{&!g#M=OZaiI#xDMZuLv{T
zqGv05DmwPUF%*r>p<#_Xmqg7jzt~7MLGbqQ;*DYQr$#&|tuOW5jx+pZ5$?v8;MY~*
zf0q9L1}i`81K0tP<wkXge+nmj?0X=>*BmnE&f?hE0GS0u`%cJ?c>O8RZ3I!V@1iMW
zX2-`rD-QY%g!Zp>@o#)(LA-vB_YEq%VCSTFuBn9Vi6kl0dMZ66Nj)P8D@o2`#760d
zdS%x7-BK@0_pHE5QxIAM1Z_vdmw(~UKWo|kDc1+uxTTxn!ssv^rTpkiy!KLHaQ6dz
zDoxuPc*>9Xt{xLNynoI*msG<qh=e>9?S9R-`{#=AU*KDQgNvhQDQ8@M2@A0xv#7vD
zdiYOAWKvRYs;y8EL#g81=Yyf<!H&3wUUy^`D)RjaXVvJxhw=IECH5W!l%V5d0Kn*)
z-`N2-9gLRgzzpvoU`6xEhTo1lW^MUTj|efdkToUTE25zZH%y!43TyXo?bFd`mJ{i;
z24I&K<?rlZhTg@AB74sm#EC^w*pRIbY;*s~@a8Mh-tf4`20K(hc=|Uf|D5TERbT8g
zjPVKz$t7bA_6~Ro>=`|iQ8Euo-VT?(9VmZpwa41Pqbop_j@c^{`^JB3p90f;Mf=1N
zUNK_G5Yv_SV#+FubAt5-EU)8BYw)9($#C<nsmirr>{;?JiEVgnT&$A9z-|<R`G4rA
z2LYw@=mCgXWKkJ9e2G;SvbhN&OKL<lP{SJQa|g~yAUEA$mQD^;<j4}<m4*CM>5ir2
zy#@Tl&u^TC-FDf(^2Jd%i0<b|g`yf7tq{e4i;#lNV<vUv7>%VdK~qiY!n#myJ`F2B
zGi+#wGYCghfCbHd%?#`rzwyaZ{Fjt}fG3}DvD2ZFCtX;3Xh8+_deDp+RDM~{;R+U&
zvuH|Uqh2HA1lGiom5<yjs#lH8(&}{*|0_z-um8;g(tkSmv>|Sc!m}3xTf@J0hmEKu
zfl3R9H)@LwMXvlXgUYOdKJce&+{hVrX=}&LH2>-<{4aU^vo7dCK*A3%aeozAY)yTk
zpwQU+<o%HgP5&I(4mNTPiquo!0F5&`AEJmBMbPMpo<5@rKUmW2;D_lmUc~Zu$qr&H
z&%d8_an`@yE5N_jolm3}>>QP-*JU<P;mAh1?|tYoCsqdG;tI58%M|2^aeVgpT}@f%
zdHB+itO*O1L5FxiHukJf-j{zQ?3vy9Rum~jIJ&~W*2S@SUH_kMy|{y7_!I(TYowBz
zns{w;8P@S?(u8p1`9UvMM+Vo^&KxTDjKg+BD@F(N%ePkZi4feO{G8L!JzIo>iw8yY
z_RxSHUYy^4@w4>5h4fon;<++ubqu}C21gRK6bAz{ZCR8JyI^W&sy9q6I<v)3Yk-Gz
z{RxV&F=FVmKCZ%!R>hUSmi$MH@c$I!^{mC)cLFD;mhiDPdOHCU&dn7UFv<2XbX1Bv
zqNC{8Yw+wB6m&{_brugDbg|ZsHLch>9+7=8vG8=Z&|!DWO3tJ{$5X#Je@A2Pgt
zs8ln0anz2Y^)rYJL8TwLxPv0MHReIlZ}**;dHWDhXCFrvknLAs^-HWrv@><mLV8ZR
z=ZAWQ#?5i=Ip=Q&dvFIvayB}PW}s42Yu#<E9W30~X7p465ns$~++Eivl57Snj@wq=
zr#5&JyT0wt9S4`Z$gqXXs@V;>Z)NYYzj9|I?rJ(Eg3%&shZ7sZu>b7%(Ud)ta^E}P
zKU?mcVz7&66yXEWm?8oZ-^N?iM8hK@EGc>v))Ue4Es@#Zx_M&p-1Ox(i)uT%cpase
zwP|Zih7m@4Lu@*=b6#(r>Jc+0D$tq+wC;-7YC;mccXH|Ca~i9{%Wk2c>`;fo_*&0`
zfJz(QoSSHb@ax;Gz(9q?<_nwGK3o4wsK;@!j1jlPa`hmry#okcZ|OOjl9BG{EnSeT
zuAR*u#pjC7ml_?;NypMkob;t~lRsDe0f|Rs+iOFM!U3&rn};3(Jd<9LUM1x9md=bL
z`F<(AbJ&ocibyX>&s#57V?s+zftW_n4(@y^y~rGfBP*+BXz!Kuq4W*}hv5czDm`U#
z$~Db^fggnATAL4X6U6Uq#MYL-r1UGglR66ZCq9rg<M9@hJR6H(Yl5S#`J-oH(5e0X
zQ_?H+BwWyfcS1Uz=;l9{&IeLRi}xw%b>6%otc$?qsr0V&4zo&o7avQnO0P@zV2-S<
zn3VeZ-k08%o{W}33jSWAIV(F!U_zgGji&FZ{3+i`PZ$fc;qL{mw`3^M9t8@j(GgI?
z4pl!g@FBkOitX?$`M)N;C>`THoxt58|F@;511Xf!ed$f<6;eU*!X3%^N79GVhll!~
z62#Y}*BAqsqbenHmDAkK5&6rwXchTRVFPV*{>*rgUL$13me?4I7G#MfUG<$|K|LZ>
zYj!^chmYv@n3#bJdwgpRtd#CauSj=6DjIWXHG{}Le1-{mReF^GYz?$Kg(C3i8OH0b
z^adB_JaD+4E9rgdBMx&p=md_?U}daXS@TW>H6;Z~SH6Y?aPfr2oY+X}y5G~Sf%VQY
zD^dQ6fdtOM+gcnwPJ>H8$Y-QKmVQ@ynNb28n{mCCK9v4O`rqm2?MeTO($}O<ORq@x
zU?e<cOD?6SbZq~~n<0}~eM|b9^s00w-IY#BVLO`f9!uYs{?lP6afJ3QG161=r5U1$
z+PSyJWU+YB+m)eir5Ww`h1e}U7&AB6$#p%^b4zROX>J>4CkCi}UiwSvE7B2d)K}76
zdd^6$-<JNr(hpq<Nra!1{zUqHjyBs;_0I^RFQk8z{&(cxTlsYH*QFOFap=I>Ov;t?
zed+&@zBg#Y>>y0trxcjJJIqMS7_beius1*f`pZte?F{G4ZIJX4Ec?#y+_6qkwT~y-
z89<TQsRkn%Sk4fFZ%SW~?i%4PMhd2Td`J5KNZ)lSq^$4D(qBqn!~q8+p5lb1^uF{1
z=|4+9cKOHV`D5uT(t<ezF*s%m!}dqg|3-U!PyZ*>(Pw>vLym1vKM6EqKTj{`B+NV3
zh8^ioB|V_T8v)cl2+kNZcuaf}CC>CxCtyK2Ow`lcYLFrSKapNTVVeF4JWJ_q>2Ia~
z*6(k+k^fuLH>KZ^?n1-P8Euv`&E#*T{~-NjApbMz%hF#+zs;$#J3@yY(*u6Y%(;CV
zetGx~=|?-Zd$b{LvA&lyV(goF!+Moqm^HzDQt}g!%#L;ec?4SaQ-c^iR#<yvwAGJ;
zTq$>WL;8K`YtnB?4-A8y+oAAJrK$9e^j@G?X>s3@{zCc_hDe;@04{B|`C58Sx{yAK
zL@eQ!pO*ea`g5$$%(y%)8{lK9mEMs)I84Da!;eJy+v|O;oflROPzY}u%Pr?VHeJn(
z9=C6W?<`Udx8Tf`4p1zzxp4=OMEH5>Po@7!`kYitA4{*ZU`OEWiBI<n=~tnK$!WbN
zeNFnN^hKhc)B!SQAu@(b{9O7~w4#Ld|98^Am%c0=5v*Rb^&|JVCw;`7_ww)XuT!F&
z8s?kO|I%FVNGEXU{KQ6J$H1llPh~k>Z}VjRnY~dHeZ&C3g&E0l{ZAtNL+MYX&*8~h
zEXgJ2HN})Ek6MKaA4#8;{zUpy>1D8-jHcl!kG+&$kRC}Ng#45Ke@^-f>CYIXwd5<d
zrctSm`cv*abN~w@o|N`ap}(y)V)fcFCrkpY3?h$KqEsH6l3z{ib@q#wp_n`RXr?y%
zbHlpK504~`Z&^sMNq->yJLv^;Bk}MF0)1ZU46?eB|FQI@^bP4xrO)H8VlY648@eYo
z(!0`!1NpxueO3B~^f{Ztv?Pg7e)A3K4XN=vmxFgGkYQ_$>m8_a#*CIwkNpPPQ07e-
z;F2I{LuYl8TY^w7B8cxW(V?s2b*ENAZH=fsTH?LY{%=TsB)wv9F=H}k#8hWYy`9Im
zOr+POZ%Th6eU^ck!5Nss5X?}(&r5$VeVm?mD!nTG5rNk!uvG#0&>*}!((}@DhiB48
zQD|W&fE?laf?BdMn5T{nkiz%7lrL+u(v@L$H8H_vFwnCza5fB8$NC>zHLNi7a+rZl
zr8lHMm;Rmf8R^tacR27XzVxp2fPZZzRnl|P7p1RAZ!&WyV*^wKQIdAji_)voiS#sG
zJdr*n{h9O^(&s5rbZmn&rdu_rP`db|fdj6oo9wI)qA)!V>oYw>!J?FY&M?B*w;L)k
zQC2Pn7Z;Xz<seKQtJU;X_|^j0RC-bRE$PeB3*Zm6@hhe<@*3$m=|$r6n~PtO{y_Sg
z^oqf;8II}1>d-6V+s7gQm(A}Kb{dV4#eyCE5&++$K^PItpJdjK0okT(&<H?ghX=wG
zq`7Us4Zmh1dh=d8;Gxu1Z#sgGz??TWlVxM)w2lNo6X_-CE7BK`{|Y-9Qy<c{yC=OR
z{Y3hK^eE*2s`L%%>(Zy~;NhrSZonlM(mm)JdC31o>9?h~r2Cla6HAs#V4WAGj3nTr
z5eE<gY(avj=&;DkQh)r#VI0z$x~&MaJ8X0GZ)Aq5-K_hLt#@td^&P5S1$Ot+4C^og
zO3Bw3rLRk$2f)k#r7}!n-5;=Xqd+;~&+kaj*+7c9jjRYnYfD&J#l;Es_#^2pqOP5x
zEy4k$+dOAyg=io@p}iBE#S$A>#_hZs7~tSRX9ve6wj9<KFhlR=ndMkpv~<IRcsF~P
zq>=KKOW_L;If5}Po`1%8tc7$pY-<wCo6_${pCMQ)?KGh)HbYLP{2UC+oB#fj^mXYq
z!nY%wMgaRcA;GQX>)fh-M`pcR-ap`+dT=a=i*RGph7*@46*^9gg`~l;!YvT_5-%gX
z-$Z)A9`+bugoT$P$S+9uj5jKtD?wNbQ<goOBEYxbWsF#uTcI{H()W$|z)Y5YeNFl-
z$99zl2ba{h3wmR#L*T);KxTbimCX|C3>2OYSU?>?+ZAX>z^N6Xv{;}E$}mGRXgG%>
zLt8fnKRlE^8l7d7GXeLvr2D)}fE!bmqhnJXk?!o}|CaO(=_>#aE%@h@#3Mxyccl9q
zd4KcR34ZvR^m!?hK4$%#udrjMOa{@*P=*_}Kq8JU004jhNkl<Z__e9Xvj2UP!7O4y
zqCLjA*J$|+^=?t1+zz|i-+2S8XwC&2?B<k!B*zSm1VKrLpOQW$-67ss5I$@%CJm`k
zsMk#daV)(ey(s0(-g}Ns+i>Tdm13?VgUO|Z^s@8{2U$Ngf1P710?eO{Vz?O&X)Qgm
zcq<CJrFC1+jBIfuzt#@1^;WHbjXIEQ=MwD~FN$?RUYR1i4fi~@kwNw0?pa7LG0!~W
z*kcwfK<@=jo>PYqPo&REza_nBoyf@(UNgXd&YvSJ{LSt;lirYCWZq<DA^#f9jf7C6
z`jcU6gtTfTAH?JnD=9St6s}~WR;I88XWDd_-r#>^1N{$jVB$+mDAT7%HxTQ_j>8+u
ze=0pEeM)*sx?>Zyf|N3`RJ1micC#fvBfSZECbv1Ft?^bH5Yc@TNs|9(rO!!c7`mLq
zF=dczI2~7P^KlkU%f$^KWd&}&#Smq<w_X8f3))dLE3E+ktjup^w@tF^SdXnu(TWAT
zwlURf3QO%k{&VR?=@scdy|jU%2QFUl*inY)!g#$Vy-fPLMdl@R$~hPNUfR2!Pf4Gb
zUb3^?wwQwj+b$ZQd&;x-BZZC4-J&<iFdbV2QnU4X0~w--Gl5L{JYuYQs?r=tFM@B3
zK|SZ;q9sx|#L7(i5P!%xLFL%m_u~mm0?K&jQx1*DV)vZ#(-o>0Hl(nmaEBUieGMuk
zv?D-RfWeoRR;`{fNT&cHd!Jnu_1=cBZZUlg5p++GHKVTz+~^jKSy&yeIb6H}`W*4n
zQu<Ju(wwj*l^T(M#AOjac1Sp~j>d|p`#3f|-t&}Ra`sJ2VA1l-M@FyfY2KW_99g{e
zSb8Kq4=E+Gv2h-K#4r`BxTAWI)INgvpzHVKH{n|xMGIo5D5*4qDJki(&j5XM{w<iS
z8)Oz{yvLz-DLjx~m0kk6mPHRG51$hZ?YS&T{`U<wkfA7*aRRoZZR86kA5XEq9TF-G
zy3g2`!~O3|P{Q?WjTCfE5)ql<%Js(!ICD#OW>!pz68jb@^ga=^=C(qaO{MGb?Q=`6
zhEjM=dI4;6VW8sN4r|Xi5~7#KmHb9yiN^vgJs!9vLc9ob7TDeyk<ooCmrm_Gi#f)(
z*Fw6_0IGh}a76gN<Iw{S35730dSMoq(t=>I!5)>ov6I91?F?Dp)@Lwyos1Y;Cws;Q
zjx4-p@WI@$gkW;UAdp@iuH>+f=$5MaA_t(a;U6@9ePJc9iRpS}$zL#t>(O403{QVV
zG}*F3jWLjqZDv~@R|@X)2?Cp$iE8=krr*t>FaQ$_U27S7XKg1n1l((n9S<Kqj8o*_
zg(adD7w-m1L~d-c8e6#}*=YK6k-{t9d}{EMpgsn-r9y?G&1W-uo<wGOQB<Y22FHXw
zu37J3UERFc5&YVTiCt{Q4KmuX)ApfPY~U2!M2;9NsHH7Xu;YgMQ(&J0w#%$&mD{gf
zAC`fVGAv2C8%uxEbFR%R2OQb?=E+&1@R&PROEyCO=r#^z)E!soZ`6bfi{8?TY&;Cf
z@KupVE$DRVNIx>FlJmpRskDf(w090}XibxLg1~#;KOUii=)2{&yMy5>B7l+u*%_9(
z#N!nf2E=T_4N5t5z|pJ~9Zb39F~Qsmg~<mm-}^F|v#S7LubI^liF7&eBMs!xyF@A}
z*gdtFY|U*N%s?-K%1E_l=F@Z2r+|RE(Fi*D`diG!F<KsDC0YvAPozgN&LviFV(iHf
zV_+_gQD2Ke5ZZC822XMiV{~QY<lL&fLj}t5yAL1_I8?JHd9AQ#y$GnqbWeEy8Rg$7
z{I>SYJp&SHUf7^xYdq*!%No<6LkA$1%IUgMtPdJr1;h2~(1&+qfCc=x$MpO($auvP
z?jg7io0OivfIBUY&pENVz@Z}}H*Ywx@HJBS%*qf`i&NH?mhC^Kb+dR=-ptUV6h^EL
z<?on$q>?!<Iij|e^zmW)L>SN-gsy<RB`ii}8#ZXGdmr4;h`2!2m*hkxSk{IoZQTq-
zyjrsm6PCImWyo=9wbfC(f&6zY>;vhHB`gsBF|KQ6IZ5xc!#S=&-olMez?h4Ek7DMU
zSD~k+z1Ny>VF##uZ%{rz+{HNoY~CN3R)MbO*u{Mjd&&@(5O~F&jbuH5u$f_0H$tk&
zx?|5xtol<BdMu^O!OHh)0HH&bE%(BLu<+x>p$9u6eC|-7C-74Xa?S}i>(zk@7*}(f
z79FMi+;EAF&GZ{?;0@s11Ae2RJ#$HLKPRcn5BFfkLhty-sYPRj$+qzHC+(hd=^g1V
z7Jp_#jXM@iWNyu<0}ET(S)>(NS6udp68p6;WBlrdTv+y9sqj_xxt+Z?Pq_Ipf|=k?
zat>1368oGUjsXk<wC1E85qQp_AUvglwKYh&cX0~b?vO7w2xUqrIH3XkLno1E{L(S@
zHPX~0ylUHz-K#_WNz%IDq*f88N4^;$?o?WX0^Jm`JEKd74akg37PSSpc6%*oIrvpp
zfle;{8Z1-<AlIWs_za!jk(UIWPUuoF&5oeA?rnI>#+t$T+8T8_N;0yN!d`@p6^CT_
z!<Oi4X9;Qeqv8Hd&Ab;Z^%EFX72#PhJPYeK?tMiE3b^S{{s}`83fi-QQR`Sy>yg<)
zVKz>9mL7%3rLbd!R6$AN6kEFE`E!WLCqB@|JEcn0E`ZpXS)S}L0FK0`=02sZ{Ep~3
zl_#GJI?;O$U}vNFIvyqgDN*&v?QV^K&9U@AI<Zh7N=C7PE5jJ};d5@&&#rmg6MSUD
zi(MhWa&!YXtb1j+agD@}(dRiqM6Z*xw!>PZK+_llUVVOjFMm!u^Rcvs<g)9}aSS}w
z34bF#AnZXXL!dh|Y~#m_ta%*$&|%hEgP_)IlnF1A6TY^VS?u?8=y1d06_jVT#=ASa
zjlvO9Tq3Qg*t9sEK=Rjz{Iky9BE?}3c$SK^a7z)caUT-7#~R&;O<%L9Yg_h{(G-3v
zJ>VF{^a)>)ezLJ7wD5`Ap#>|dKY=8d%x;+5ETNr+bvI(W;M|2{UUiMT$<U~4>0`p@
z5&bEx*<oYjH?{<X!3CMv5h)4-R1=0Run(2_%_wF}1{5OoLq`#sE~%T`!3{QqR7bx5
zWzr7RAsMP`g5`+8f*yG>K!JpMPkH|6IV{;fIb#qvgU~5-yTV8bZQT(GhEb^p*Sqp>
zta~`4TpB4;c;DD;H>=pvBXq$IX)J&&i9~{Y)=Od|Af1x_%KDB2IgZf#S~~YKWX}Nv
z6*)LR!GS7_E?x{q5vXs&FZqfiN*=-KU-n_d{qE}s4cuH;HhR5fg|^S2@fBvJH@K0`
zC>##f^lV_u%?mU})>;)QLq;5Y<SAI<Jly9FQ=3?~wS;*>Z)BxgK2a}?Qhm7Z+&WUC
z*1O+;4Zy^NY_aT@0Xnm1-CzkaAjX}EBtymKfEPhz7<XVnY{>u;0yoc%7}!ZC(jyAe
z`)$}+)w_lT01QF%zO^&bVMiW!YMb=tkqX+?=Xk$4ng7P<$u%Xd5f!R9%60+`v9Ua0
zN%CIV`t@R%+?G0L!H=<i1^Mlz5mELppE9s}u((o;dP3mau{tNx17P%#GEBj;N2N7A
z_M+ctkZv2*<UPMMw$Q}9Pavu}+i``~->gArT;&1??<s}HfZK!PGe6t{IU&xBBs>-5
zPdJ%)ix$P3?&IKqKwSzQPrmC9I4U`+wkJ8^D`ptyjvCa&$TvZk7?FQ#T*is*h47WR
zIo-9Lp0&3>4Nhl8HZr*FmF@r5pFb!}SJe6{Q~uJj)a${a_NBi-s*ybC*T01VZP^$T
zBLm(%fF~R)NT1K)oF>Gc0fF>n7(vyHZ0{JomzkRdI=J!NBOYObx`f@WtfxHi&?UnG
z7Cs1RZ6|<cToWaig2<%7&-Dr*iGcU2G^Ne|Ih&X^mmW$N(i!1%kAf4AutTI#cx`x!
zpl+rRF}?q(N1*NiTC7h1gS9mQv`p*UD^ShM6bC6HYKh8nZq$sC3@qaC#)5&?$RoKm
z^j}I(4+rCCW+j9~W?WwZ_20ps?RW2%LFTy)e`)Qg#S(k?SbB1ZLC0oTU<^82f`Up!
zV^7-qgqPbeIH9xp?xlr-OW5Q+6l;QK%gJLi3)X9DdTAkkc-Tmo5jx!=WR?wtP2GW)
z2Ipi06t*)16co{LtV;uLY7(-25MNtFnzNm*tN^w%7Sh^++oAphx-hjK+OT1=L{T1n
zhW*--tO}XcXjEZUv4vUa9{4M3L~D(%+ENGJ3>ugROVHAz699h4&Xdh}%E)Z@R-m#q
ztBvH;K}stjNN%MIi-B&u@7g+@ye1z7WdR-@Gohf*0*dH<BAsINWH9C#Ygkj_=+)tN
zB-tepKtc-^+$c8=)ox(GW24PUq}ImMnG)0Os7Y@}!_VMOw`6oR`cSc+E1onUgv@0a
z^@W|;;0AXgmTu`zG_#N?nVVCqdgrKKX-II7g5wGtnZ`Au&T*ifkynP{cVHtC<}X;J
zEw57=Ms(#cV24|8?Ypu*AJ{SM$tVCDJ1K}XTk{erzHO+9MtyGt#Sz5!4%Mycu)W7H
zw$=y1M8)ZvAnb~bR+#H7Y%tjMA&oB#O4y*VkI{`E%!W1Gyq@)nP7k+7S)aOGN+;4a
znz|SG(x&|ccotT+<8YJ|l8qB-3M+JVNT{$5govkFEZJTP6)D@4g$SHB<Ao<s(4tD(
z!_Yf(-IDIJgqhLqwvYf;qjmU>h$$F#O9PK1QJEd0utF;~L`Ic?7_on^na_G0Hn20-
z6;H9D{9Cb+1AM#~RNq_$W^DdoJ%j_=63mqR@}4gi*+I)t@F~N?R+h2`^Y7T_BZvOf
zHmRaAa}ufigd*iVhYyA<plFZE0Npz3TT?V7n2`IWtL@CB)=q$F&Fhsm;_kxDc(|<O
z)SemzB{-;C4kL&)8}R`m3rnsi7=II(-AkC=8)SB4!{>HzP1w^=tcKgJ`Om!%i!$pK
zUM}vi;)!;4x>2Nq!-t=c_@pfI4&PE@E2G3C62!d};;EL<uAWL4zyYn{;k&~KDuTU+
zB4R}2zQuC8=XelQ0{Ui#@|PHZ4VzRDboOf4Sl}CtFf%rAB!@vtTKXOgTIjWL){f1d
zMfx&@;4>^^L;#f8)P;tW^MZ}FV>HjO4j2WxmS7@m@SI(g9Mc_X=WWpH*X2u+xZo?a
zq!NPdx<0g?f|ysLDo$Pt{4rd8hNj;<e1(Q4-6+h(B;l`&WA{jU;(aiMu_q&dxMK{4
z$UrNb&pTA1g8uM(MhTCSYlHHfOUHdi+n$7S23V2zxmdaRlNyce;a@969l@2;2P#U$
ztpzzF0Z?jmKVf+`)Ui9LX~Ex%GD45aP>^W^Picx4?g*e{-fu6$njASQJ`MPNC~|3V
z;#d(l%y4Jkb%x&LfESJe^z!%I*xVTbagJcRKBJ>oK`yL?Xu`t<=pQ{R?PwR7o8g9s
zY<Rce#WWV;m$p4S0ZX3Zw4!{tBcKx6w>0VW%wSG0P02PYfW6K!Nu3$6ktC+0(pU6l
zxq-!$VV<%>8fB!A=aPtkGE?1}qpdcBC(n>-!>qa^dcNAeC&<BK#_^N%qbbeAjbTpF
zI$T?2Y!4)b^^JYAwDy6DzM82St?JMKr`logzqUrqZ0#6;8Q|YY8KA*FObtcMZIH_~
z=Ytd$I^MiOlvUQeWM^6D)Y`dn<CWbgPiArGq>sfk7LPU0kbijZfG`T2cFFGAkp2y8
zV3}cNJLESv8s_te;N1EM_6&s5o?>g@UyzT&u}w@_dY_hmo^Y!QYY}*UMJl%&%rq&e
zX#|!tV4Kwdq}Aap`H0c4jS-Zy9Z%u)r005a+kr@NS;0owkvJu>EiCHa4Y*u6Tlg4l
z>rnfZb)&R+tF_zkWYa$+$5>!_XS_wtJw{w_IS@k-Uo$gf6E4~S@<%ke_h4a3Txn(O
zgdnP)-WG(3AaaK!=R9;k>Z{T4UuGF*Y?NsJ%IUTYLU)e_8b#DgMkabXdTdT~hxYWs
z=P469?z8XWK?+Jv(F|CR?%|H!?JaOpV_kUBdr=UgtOhWC2*0v8I!G03R(D1bdNQzq
zIWo)XcwI9Ut+rRFZwo)edo7I*S-=6kCY6~I-w&q&IC2!AZD!Qf^p_Q!f-~n}cKGlK
zW%`W#y~ZkCFnzrqRi2FdTo9?Z#&$VkEn1?~r-MV`GmC=*;tnF(6(M%;G4}ybW)a60
zGhACVx<r5HRuw1?<zEAD&3MTP&mUd!Q>?``108z%H{!(&MssGf*(#<~oDEQ@V4+nO
zIJfK;aof(+bmaL{i|b?QIpn{vIKD8yc};NJi^Ds}v>jBxYhugN{87AP8xE{8D;<3R
zL5xkl?g;#PefaRQ3p`2<7nL0Cxio?6KN|!fp%Otyk~L1$c2L(t>4{ZBZtk2=>F41H
z<UB&8)fIcRHPd`!aQ8NFDLn=5@5q@e>w&yBfTS6`ea*f-wR6`BHej&vkBnTufBO!6
zJ<2?_5wkiTWNY4R_-QD6UrQI5xTy`Us8Gh8@%i?j@M#m=XJP1V0m!SvS+a$Jj-4e@
z8Fq64dcC#Cc$~#i;V8m-%!r+~{BYx~;PLV`R%QTX6wvR(NG|;Oqkxk$VyePc6ok)L
z=wmjzd*cz-;Fc}FUE5JY6JG5C^Hc=}>VA6xiY8X5V}YX3Js)%-O1dk(%1@sFwAZZY
z47;(p?KmZ&;aAr6vBY&H$=#z4yB;`z&XlA<$f41>`Q|MrOzb^rNp#qe<hQ6m<b(w^
zgqfuaLj$xT(##1HXDDUh_B%FP>r&_pNXsyN5lTC{RWfm<@&a0noT%`=nI#@Uq!9`}
z!lKEkTV0@n*R-EhHp^>9D(VH7eeQP5-6H%=X7CB?=VYy$Pr<O?nH9P^V529}70m&c
z&zNPDk^Xd+<|Xf+!0&SlK<WVq(y>(<V}50%Zxei2hM#P`3>OgCJ-V(cY6uUBm`){O
z;CeLHGQJ}>%4(DhGdydQNCGPBdEf?wL=4%fShs6la*HNLRbbBok^wU<e~zsswx#q?
zTZ4*RfLpXDlO9VG=GXXpdOAJqWr<$Lk~N&NC8I=HS(@5wBG_1Oj~79scrPHq&20`K
z+{nygl!ExsdDOD=S+dyuQ8<mntu{(pXBotLpu^EGG2?C#HATqNp`N#PFy!V|b4&hZ
zD&6v$!zIiR$u-S`QNC>~E$B#a+d%|3rvf_V0vWCt6r3aDgAOyW;BiOocrAS(J%!|Z
zY?DdyfeMFK1@2vPPZ55X7Jd#nu%(UOn^>=Fa9~T;Vs{7t$0}Y*AJP(i#8)q2cutI9
zQXcAmYiUMiMBCUAOY+llpu#O-nS)d%_Bc4{t?l%o{-<R7KK@lG@6>8jIm?`OdXG!j
zVG~a=JCT02G$N+vJ=t*4p7JH|BvlEHiB+y_u2Y2H*P}Iz+(4FqjXNIXu91%}X>;3u
zOMt90+gc#P3`0~~C8dbysbB=qi50!Ne(5q=39<oxx_6?Xy)%eH9qw~wXUa|>b<78G
zWW%GEMnH<~Rw01Ec6-{_Aj6WSUs>h5Cb(&>(moPFi+63LbLO(vcyp0vM?dl2hI{%@
z80)`9=_7rJw4&#LTDv&1p~!aEq-o3BhUsS%+tNZkl^PgZIUArfT<OjLi7ZgA9Lu@`
z?p<I8DztxfXn-<Swd7k$0^(}|$C}q1DPo5fY``pbynmnzJ96Cl;W(8JENzA{o?1&}
zYvwsWG#wKv-GQH-Vze?M>orEEJTw5YOD-+(l+{sNO3_h<C<op6o%y89JYk7p3Ph_L
zZ6OP~VRV1C6yTzTCk@9~-ByHbYy?l!KLaDE6e6mb59*+e_39Tymq7w>YE)sb*;BrX
z95;&nQzLT+sW84}sQ<}&pONoocB*P?J-C7h)Ki`am!?(;$OwaSD?RRDa%P9Z3#4I6
zv3epsuopQ3Y9HCc@t&b=TN^xj%81&a#P*&)$TDF_EB4IJ!oW(pBfTR1%w1oHe@{P}
zux$diTUj1BGxsJPj)D;TX@=^AR;;9lWE%GeY|@-ILigZ7oG(bBC){&b0bE<_Y=lw+
z(0#!s*KK70Nfj{IId?A2VntXx^D;y%#ZvkZl17jhk~2E)&)FK`pk3oVokn_!8Qk_O
zgCj_E#n8E0dI-}$2oC|Hc9!Gq!_NuUJ6acOJCdNX^GO>k<8-m?TVtHXiv?$7!>O9V
z793$Qg-zVTnA|X8HGUy@9rMAr6c*Z){Sr+_aJynfhr2H~(-UsJwqsqRGS?YYJMsyH
zC8&3&NPdDuI_nJSMW87zo!LAzfgE%OzO-1Fd=P!qmh5B*qOf8>ZAtV{{v8qg3(Vmy
zjOe+r($~c5!@Y;U?tgvYysZH_0)p!m0CFpqN4s>~JgjfVb{>aNT#GWaI52_Iey2JV
zL6A^_rxO+6jJji3w<EV)9-nM1_l@R;xn(e=VX-5hocJP8#B1ZmR#w@c`b4l;n7Ix6
zEsU;Pl2fEqVrNy29t17(9=YW%Gp>(iK7fEi5>Cg6EMfwTzX6qQ2Ll}o;*}iq@Iq=R
z7EP?2-=gO|9S8_1M?S&33r5I^)WN_X;c<8T^aN*HQ@(NNGahH=iBLtge?ipOnNN;9
zCO1c)5AK|7<vC2<s;{mSs3ji`(7(e^Yhn5+PgmL6%2<(m2OOShi#{D$rc;s&H`c4Z
zfBv=fK20*Gs9|v5cl{BS*`fD}#PQ6kvC*36<(<eTdfgzcfdm~MXGR=$1-mK1B)h>p
zx6D@Il5oQ*ea=s)1~r`ciqrym59?i6HZx}g&?Q;)_7I3^;N9M#3{%)xhn;C*V^6&|
zKn-c`%;2o31Wj=!;nj!ApVIF;(gPxaH4xx}usD#mwVScbk~JAlwKY~@YdfOw0$Su4
zI|jXpI(%(WWs8Sij~dX9t-dwbB(r!uT6}WWwHHLk`Y-Wsb4<Fdp`@|X8hW1`!eEW$
zCRS-}Se>iG!vqek*vhK@0uI|+g`u@nelJ2tVOy492P-rPHB%IHPa>FrmWS9gGg!f)
z4r*jie}*Vd;MqG$$l^6teHgU&47rhjCzL5mOZ6+;3CqDSrq&#6iIs~id|_TexAkGm
zjC;x3uc2`Uv2cz8Z3c;468!_|MSdw7HKQJK&Oa4{EUG1N%B<LQmypR3p-7NNo<E|F
zBk6?U%Xjd;Io2y8xYi9&6i6!r5NE9A*19T7={`@F#l=_Bhtigl+an;E14<lG%*dWm
z1`v5n2c&vTbN@BoCm{`2jB~k3LROrklkrPi>bN=kK1%fMzySpPpfDEJlvJ=|wPQKw
zhmCAmKQGBa5uY%nacc2##!B@nz#X<Ww?4xi+qhtZb%)k3vqD#H!$t~Q>d5gQ>Y1Mm
zfd%a-bY{b&B|o$DATrp4m(~+fgUgl13eVUMjql%RA4!{Z!I6kjtlYArCWDJZgiGlo
z=_5POBjg$p)7DIK?-L-x(t0t2g3}U;2duF7L7jQoE974o8$^~t?nui=Y8W|h5$R)R
zXm`XwQDMpkF=UiJGxk{omK|#}!4N%hGHY(A+Lpv|ZKG(z=5<682M|!ltf@!R^A!Fz
z7N0~Mc~>e47xywueOi~&E4<b*{Fgwo+|>5~_zk@Fxq%l~MyoAs8|k(bJ~M17`RFsk
z&r9i?gUBLK?78w9friD(sMpT0P_Yub0f{>5H48$?BZ>}bSF9-RJz?DBNCdQ$9x+0D
zVkZM{EQ>tCLvAcw+J{eplINz_5d>$%l#>AfykXAuk@SE@u9{Ty%*;qVsD1kL$0J16
zc5J%6qffJd@6)5<M%EMQOJ&ukv?U*7QhIgJjvV2N{W+t|y0+ODmo_cGmpjEHtffcN
zBS7EyVW9FkCFYjr-RsYmJrh-~z%RpeUtxnb0|T_>B#k)`?GY$U#H1M-xf+c?GqW<a
z<!DiOjQn_K8r4IAHgMeMY=MUQdI!1U)NEVl!rM?QI>xGmJ&KZ2VEeI`_Mc>;-<RGe
zgDI^Oz94<w;k|c$@@a>EX|O5*=?`QS0PW88r)I0)XOnaUQ`0^dXO0yaA;QIPm$2A6
zGLYQ37g1H&Z=@grbUaA7n-=U~PGRDTX(rpj{t2pOu)r>igt9jJN{>cd#sZ|CrJ9Yt
zF*-*OBLi4lA@D9;d?Ecr`l<Aq5iVwI&y}U!#UOucf!;r+f;%%#bW+!2JX8h0cUXnb
z*muF@nG?9}2!CXozHtv6HsqR~oy^eJypLJ5M6<QuykonZ;i+R`I^)v?$0Ix30GHC+
z(l2lTF94?mX)Wj&tHViBm(nk!pIawx&2}sMtr*z=x$u>r6MHr&ZNz(>@mH<~9Mll$
z#;V3KNi3cvMA%rl?nZtS%dQH*_#@*cic^LHg4i3-<j~)@6C-ns)(U^HHA>_NizbQi
zr_%SNyQBig45klRR*0ro2}$73FQi{eGw2u@;;4+^vovgezj*25cSv8wW)8io%{*)l
z3s7QXg<@qm<I1LBMFl>So})aq-#qjFgyPJY{mDbML|8o@LuPSxYXtIW2xu(&yS@!V
zNm~8B^q%y2LjWS;S{cPYWYmTTSJH>lBaS7`V0yHcM@8?;#?#LfQU6kUBE8D0wl?{`
zv1mMknPCPJ5q$#<IQYPU?rm*c?2rf;>s(tqZup9+os+u5ANBeGYM{!34ZG<-XCT)@
z%d^h_*3x^@Po!7rIS>P=Cb-W4psw8vx6+mLQ|a%em!;1WM|60Mg_*>fW8-@H|Iehq
zlI}=frcFM|;W=%Tkxq}WIAr+I)`pR7aYilBRFdj&0a_B{YpWWjBF@Buy%d3`c8TQx
zhA!dWMlVTJqaqi4PHLo=3@O;(|4RCS^q-_-)cA;pTo`!pNO~%L?Bi6D4f&3AS1P36
z!|&DT${f+utipZ|C;7i4{X}}raHh=c{+7nd-64f2^ry8N^MVX0<DX_gs*#uPfxL_|
z^$A7rpb(|ly}?2y1GF3jF;25zQsX;e-=#9p9iy;}Kv+rsKVl;EJqD(nS*%jArQItj
zY{++|=cJwV1tVpZyw04ku(pEqe)IfL`fpM$y=E|Ujz62*7hPH^G_;{XB^M$r`v6Uj
zdAMT?>zmDUB0a|(hSDlI=`-VrO60m%fy7ljg}a|wjGP%dxq(Dq9YSy&&8$C@z9_w9
z2NMMUFL0gua2DCM^b_g7NzX}NqDmQEXhDq|BtiSl^8@K`rBmr;l;i{;Sfn*uL-@Cc
z`d`CTnG?X4a26+~Ji#d1C&7XL6h`h2C0!bks6gzMBPhX?aSD{IeJcTcI|8^1$B*_k
zfUWe7^dF^M`eW%HWU`mdjAc0SM?Q}}UHpOc-=tEyl)i%Fu5BF6ntse*N$)Yd>*nHL
zN&f-g`*|#WO>np)g}Rd7k>2-Fit%Y!?uq%q9kwM0nh?&t9W-JHaaS-&ucb!}7@1fp
z`$P9ZQF@qCZn&ntkYrn1LA7MC-Uy9#E4?TEwRBIK5sgH>EiL9#={@O*^h@a-9|*CL
z-j)7ZI^}TI38uTFrj?BG6X_SyyTo)i7ynB7JL#Tu2Q|*nt%$eN`h6sQa5#!0q%i4c
zKqWgaBlawfXWRMpkq8$yh$%DN*%AIdevuJkJkL=;k-Q&s=PBUdtu0_l{vS!-Vkq;Q
z77AV&D4N2@pGj}iTYMwJUrPU%F@)|J-#udD3CnTrC`eko|4N>JjLf&Zf68Xk;{Qx~
zmu}^u58K(ar<olFu}172TQIPc{pJb4r@%NhgaNgc^<&;pZ|Bvh`ZbPzV*k5FR1=OV
zFAwFvlRlRIL3%;DZ;Y}A<xkfnr0$2(PhEwR3HgtNKCkgyQ?p!aGRUWcS!5gOZR!6f
zHPUZO57<I7OJ3rMKBBv-r~l!7)2_az|LYD3iLg9VBb@IJH&5vK7UOv$ol74}VqV$A
zTUi4C+88uY8J{&rBgOCB;s8<>{9~NROKATq(mgPev?ZQO-;&<((Wv2s*V1F@q4Ww;
zSwLpIAOiTI^fTAtPP&qQLVET#vm-O$fRwGCOCLx-kiJJuF#MF%Xo61{hN3eas*RPn
z{}hB-PYIs_^eyq@!C#tq5YrMJWWd$IV6N??>J>X=br?a5Q2cG_KTDnTmh?hDAkL)=
z>0Rj~>F3f9-2kPFe<A%BX)0Yv_qb<<>Xp({>1WcnrSA+bzLtJ0<<hzIMM}YfGgRRW
zo=D%4elqF+CLA?&w@+Kx8W(B7R$mQJw8p@@sQ66SF;T`y;qpT}n{%IgRCcI*#(h#3
zV!*BE)S5?%*vT@d7-%KE1)Meq?L5W@Jd}PQ{m8X2Af$ro#q$Q9MOiuR`yU{}p8o&X
zxYjRnaY4#`Aw87dk$x`y;&2+Nh}vgXxU7sh6G`Ax=>u!`*awj-YXH77jzJ<2POPO@
zgBo59PwNHCG;1qs2YYuxY}Jg)e<}T3`j1j6Ju)j06^G=QK9c@M`iVP$jtnm4$Zvtw
z9|K`UHF+z29})IYxbH|;($A&OOP{ux<6DbL9!uYrekfgz2B2$-Imvrml1Ht%+dbB`
zG6BoY_{#X4B9+du*qJd=d-}hlZknpYr_%GrbM%E6hF(2aoYeS-(iK$r(&nKBesV3n
zC;e)G1Sg2^m(p5#7dRpig$6IxNDrl74z^*-9=MjiO|a;6mW)#-5>B{2T!6xkc24P7
zbkWqNl3BlLe4heOQSlv<uu>4WVY?JY<SD%seg~`NntsNGX-`ce`xpTI$>{TkCVo%4
zkp9l*O4k^`hDrL5q=ytXdW-i1X(@f1O2EckU#bj!B)u<vG`Ki%UcM(ilKug}Bt%o#
z0>3N0Gq_k-oQVaVt-074wk1a>7o5MaN0I}2p4hnip!V!o$aCqjD^C<9gB+OiMv*u*
zP++1nr8>;O;thW$T}s~p*C>H7rX=#0(nr!egH=rOPchJsXyA-Z>NS*)rC$yP%O}hH
zUFnJRZKknA=o0-)j|s1~gD1~y@InL5l99(=+gW&57OwBDk9Qz6(v?+zqHsS&|94(M
z%TSLrd*a9_);qo^i0aW_P(MTelL~(e3RiA2+VONBNbim6{|}`n(mzV4hGOk(|E#5t
zq+bjcFX{h}q=(XvINT{fWnr$v@~uXnU)Dx_1`3#AykcFVKce^WX;8qqF<xcF&{cnf
z1gO4GoYOz23@-5FoppaN%#H8O{xBCyl2}<N6>x3KQ`N?294_EWdQW<2*Jq~2t+}h_
z@XlA#+tPamP6oT7wKQvlM;uxfXw`yh@)|R8Z6W&3k?ag`vm)4?nJ35rd5;;E(_1<(
z)&!L|!`1}DY{gR5w`Bls36|2^mbQ0B9c=;QcZX6))yN-7zqImG2uQ}p9>4gKhJYX<
z46I}grE&y--q5YJGD{p>kPTIVfGILZaSDt>ZGerCXiLtXzAm_JdmT=wLgE4)LzypO
z?6m}obB-JzcK~9SZ={b5!fA=1GLyo%3ZF}B>6ga0_NrSL^YM!>rJqZ`vR+XSf4vwp
zUId<7_2&+He$C<Br#OHeF=>y<=}}UOVCUR(V*{#kYtr6-N?y3a?v}`@whNXVE>qkl
z|AylN-afER0hKqmNg=trcchQ{?Z%zye~k)`$bZSS>%s^Lo$c|OCAbxSxDRDz0$XXz
z>U4JI?2v_3TIdw7?GSKb1^qJ&(aq+Wfhy-b_Jr+pfeE~nUSt+hK6vbrHCP*-HCU+5
z{)}&+h!H#j%1n*BXIQ^y+&Oe$Jbw?aVeEC`bfR&*g9&+Ycqmhj^*%w+i9LFUq<&{n
zrUq9=;il%EYg|um&|gGJliN1HKngCx!zK82FCHw=Z?Jg1SADkp6F7b@J(2Fvl;Xg2
zZfx75!-0~y6|{CnolAw4_rWFafXn3m*c<7w9S60r@GE_9@@+5M46$($$cUx`5e5)-
zs}w|mbUh~gkY)Sq#oLZurcc~?BE$%|J~Mh`Nlus0$qP4t;fmLUehVY@B)>gn@ybC4
zl^({c2a$DV06qoT=mrN;==P6`@KFBxMEkQd-n1`U{Ac@#7weYIck7)a13RLjqO+7U
zvt4jRS%1i5m~sYaX8vJIteP<vCW7h4?;M=Ftp#0MLaN4I+dC7aQ3s&@TaKFHXMMU)
z{Ec7p;@|AfHK(O}9xyeyeql_$sRtn$(hnz`kwlzIXH-U==e6gcMS=7Z<RUjBQD>yb
z<OY^L%lKbigu2Ct{KoSC<cl-Q6dN{EgzN!FrLbn{L1afpGl595w24LqHISJ#mhbbh
zbn&_Lq0~w*+6bW~L&K)V?2m5ve~JiyEm{0?7jKZY{kq>3spSOWCDU^%onWEY(2=6n
zc=IXD+*C%VThdXM;o&Dhs6|Ae6B;9Tb|&sAKiq)9w}vJDx+46|B<~luzs|4zI*XV6
znzs$LFKh;7tZ2!>pfe_O3<qk28DANZu_ND@Va4jv^ra&MjIUeS!4Na9_YCglQ|gmG
z|8!;k=YD>JJ9jp$BFu=aJEowUe@(gUbTAR50YaD=(AM-%H{TiOa36v02yKo5+GU~G
zmXlEz)+Ura+X4LFB*lN$l{vBFF{KlMc-F)o6Uy4{pcg871cIBfWz9leTabns|Jl3w
zjOadC<543lr5E_#N7ChOb>M$_5&pA2zp?&rG0uT|=kQ^L+7(E#a|o!zdgatR*KCT0
zQ1*%RlqU6k8-19rWS<gEVIsXGNt)9{kW<rLJ_A7iA4c-O@gsEB%h-Y+Z!GRiERDJK
zruWFm5uj(3PNs(CR0aphCTib&B5|xN(5h`VV*2=Gj8Dn{{-5}XVrTM$YmVc|2y1F;
zQd27z-(0zqo=P7=9>{H8Kx_R|k$Lq3g#y0ssU&=K;HGN^q&yD`ru%0`iT`7MT2%jp
zz$GUSSYnu_94{!s%{`DN8>6+sQQwn<>qw^K#H@|t;sn6Hz==vr;J67aqM#xKANSwX
zC(LkUM=L_{)}}a5nQ5`~8eV7VLkeqRCdnyLXR5vLq%%XyJK;KN`=WTFXhe%4B5AdU
zWwd_-pWrud0S_<C;!oKW6PWZ%kK;SMWK=0zelDYPbxcg@l~FnhJ(rxGupkU7jkdYv
z5j2>_`fKfk-wZ|gzi9y`pfyJptS4J>&T%|P6sLx-6Ak`Xn42}yk2;{x_MMGR+~4_#
z@5-%pR5<Ghd{zcB{<^RBKeFWG6UD0Nq?Rda%t+~s7`RWr3JzbJlNxNXC)fNWl^0&5
z?$x8=cC>BHxuBvQNQT?XW@G*L%E11+d;<MXp9Uu9!sah!;13OWes7#nl;UM`|BkG?
zF_)f`&ahMaBsSO?sW|Wg5ckUFBXope_h{ArA7B2znc2&&I8qx}P(lo!kd|Iruh5MN
z$-vJVkh8|VXbuzfq4Yin!tCW2U1TwLGPf?krA=W9qQ(AG-u5zuZPCtd*IF%pTxgkH
z?1f+T5#sTS^TDJ|ejdPHdmDzB+1rbmpwV#~m8mQI^Ku}G;9N|sr#j$>jjiO7&jH9l
zK4z5J*0kE3NoRCOJ^^`nWL554^{&?_YmXLg=vh(-dO>b#91&RajElv}x%~b2p0)C^
zazXxStfxJS<DEqy-QXr^lxEEYhmzwz0v*~S#UXeRw66xxIdBfKmIYW;_Up($@5^7j
z%O_}|`g2~q2}S+uKY8)o^#?{UUc~R<d|cX@ybY#1dMRhnQ$ISKNn222+43yj&w7tj
z%W|tD?r&Z*OR6N<OIEI=L?5>4)QW7MApgAo!?@YAOvpfmy8|;GTi{uX=OC?HzPS0w
z#Vr9)1AE>ZdrR~$rg8L&92vmS&K{)VvI0x~6l7uML41I(6YHGYvejY|z#RI^#h{IQ
zD1ZqA3Gc$BjP}Ns%G&whYy3%4=6J^jI_yX|6E^&Ga3E>OU?$y_?i0$yVg$m!;?%Dp
zhTAcZ?%b#fQ*xn{S0xWxOD9Gq-9r!=txBghPG&<*rm<o8$CTCf5l{mFSlJZa+=kDd
zG0150?EE+50Jk#(FQ(Rp8tp_Ehus7@TYNz-SlKKi+1Rwp!;6DYcF&r28=F7e(Zf0-
ze~&<0_`o^jktGrl`=M|%?AxjWp*tp7NVvudU6@Kwy_42CyrSGv5kbXxuAG}!&$d1}
z3EioYF#|W>+R(|%QTb1qCz!yeBjXc?8@z(*ICAkR!{B1>UyULz(Xq=>{civtMECnO
zH>nKH)ZuY*ZoO)9hotj&sdxpcYipwvZXTf~qMh2vhz0TH72wkxV^$4reyGA{Fk+&1
zTvz~eeVZ1>;@xA2$HwRzEl+wen57je9L8(PcTFr34Eyusa1w#rxhv@oaByp+&k*Wl
zKm_b8BWtN)#hz=-=^o>ldk(BHtUso>rLQ}|heR<fx7nOGKN{MA=cUg`Z%Ln%j?udu
z4{{~FFMUt?E^$Q9biE{fUiy^uqVyd0@Dd((PQ&o`q>ueEQnBPU>35~KXrzh-sA&Lw
zEWImzS9<?&9@&n5`5g<`Q6E_viy^H|kA)IuWWqGb(gKXuXl%g@&wcf_Bz9A49bDig
zL~SAg6pvWL@d$7(y)1oEdXw_q83%nnqL^?gy)AuPde7-BA^*=wFG;VFi)SE?lCJ53
z`Hu7}Z=^|gepULi^ce=>EZ8v-`#h0;D1BS{;E+gDvQf71a;AKBiB>usF--KL7pb!G
zVjba2$4rq%de11iHxS$ru~&lu4E(KPKg4{5>1{qZMEG0MXZX?xCMME3Lr~t4e#lIa
z-sX8t`l9qARFBkkuwe;OZR2g}@1%DJ@}EjCOTQ((DLqdY;<XKwJ!fp$ccl*o`X5}J
zML$4H+dGQm8)IemP#7m2hDnOv2d{57HjZnr|4*flr01k#rZ=Xh--i^ATQlId;y+0X
zKP7!hdR=<Krs72}SuK4aeP8-6r#JOP@PhOu>9?g%(HYXPZFkHJ{J!)<>G7fQdQtkK
z^afe`9A}<t7*D0A(odzomVR;A3t0p12Xr4^p=B?`FuZaw5F>f%6U)E^D|gFNbYx@0
zCZ=c?Us^Dgq4qQLO-Gh~9b^DW2CqneApL>#1?deYFx2LU(x8D?;L}~iHzoA%8`57$
z52RdrnP7X#|E2*>&r`J6kJFh-uS$O^{kilKZ7cD(m(s_y8U37GsAof?t{MSG1mr7p
zpoWWZ#6cxJHh!e1>ppij;?)&migucP5o}AEk@s`E#@v74rpr+lO344~(l?|xNxAM(
zc>9pD(pLH|orAqx?w<5z>5rw?aR3vT^$i8-r_zro>+O4KCen-2A4z{Iy++|`W)+hy
zDb@>iGD_<YU(jQ5Y5i|YJNse_DXQXm_m-d~cDlBHfxyBh90yn0vhP=}1Ycbd-ByH5
zDTzve-yP_=w??2z3coM?sq|&&lEK{Sq{q_7(#JIXUidaluH%oS|3&&VFFePM*ET8p
zyV6to7WNmXJO8EhCF*rGTRI}(Yw26kl1(|1K|~i;hoft48tW2ha_2RGj2`af=#NMO
zj_o{6S?DwOI-<wEPx-D1S6-9N%)nyXL+|7I|Ely2>Fd(#(({t&OZDFTkEQ!G-`8$`
z?n-Y-e=hxj^a71oF}NiKfbU7i094K4;{642Z*7=aiAFCOqVnjl5v?OZ4A?Itj9KG`
zCpMfT9|WKi0{j@d7oq3Q{&NPpy`MzT8d4E!zTgDphWo5pwq6Zv?*Izvp7d4e|0R8q
zTqsC)H4k=IdRf{^Kb79bq^pqbNMDftSo#{l(gN1-oY``hL=!(`97mHbejxpx^k>rN
zk=p|tV*0EYq;bmN>FYr+O965<Bjj9>bLXhXgamJXc(QOs&bftgwWMt;w$Tc{P&Cxv
z1h@-3G4Y6hi?JOoNl1*w843Rj>Av(;>EBCVmhND87wpXm9SrH0e=PkNOMK%oKP&yA
z^bP5Gn+p?=d;)0hN}cqY^d0GG=)fK6i_#xUe<+<%^r%pQ9M~z%#re!&5?5@19Ra}!
z`zDHmv;J_5i$Me%T@!24#4G8HDN?0$)UO2W9Y9AYaw!$kU04*Ijc{3l)@+}_0Zk(O
zlJrd?&|_l9oL-L_2Uakb=PFeAzVy4&pGv>a0L2*lB<x`=T}m%YuQJQ2iJkY`(jQ7+
zkY1BA=?S|nVz>MF#($80bT|*L2Jna~L&gO)R(WN9dnkgy4=Z;2lqQTd07*c~dm3VL
z2ro#lk@(E0V{Q!&3l!zb9<%o;rRSwTkp4pYqI4G&@Dxxa$#^c^VQ$IK!wQsG|F244
zWgv0Rg5_+U1npf(KlOnQNo{^h`lj@|$T^6WchI6zx-V^|&r07Nj3dkFv<RrJVgJ@P
z6(MrpZq$JX+7$iRGov!(n3lPp=OOF1?1;g3pm0(5x}u<5_+gZL2axjDH>5w3zA8P3
zx=-+iTT4xnv>(LCmPme+=<gu<9eGI&J@6QPdR_XK^s&^WwC~f>*GRitd|GB*Tu1oF
zQ|U$NMEY=0#45qP3tN>OyO*H@g&~>kK>96mo0uOiY<$9o5Vw~+R@`>SBZy!prno$@
zuxK(s-bC{IY3b|I@7XZ0HJ(V=&}2sMz$(32#)<T@^fl@0(q~8$HYj7TbHsGyV{av&
zOK(WOD}6?~5Av{Kqqpp#6IdKkBp6!%1oc%m@3gkeza~~(N*}oc$*j;(V$Tc0`fFak
z!Utat`g9wDwY%)W2B33kgM4!7QhHDNXml1+!Vlh(-XvCt-sI?OOG7YHeEwX>Kk5Ic
zrEf^T#VKwD7VjE>bIw|vGA}9*{eNBhZJbU<rkZTiB@tkTJ$k^!t;@eba!;*To1y19
zTX4>weE`&f)dv%_!V~1E>WVT<Wzf)G`A!)nJ2xe5tl@EI1*q{+E_3Nk=?|ngIm@jA
z&OO3%v^djOq*p0S+=%zH#Qv|MOTl-qu_%`S<M-mx(w$$Ez9M}dv~@-4Vu7kRh-QwO
z4KtXEXAZtdM|==q!<M}~EaiK$+gZi5vq_9CJ2GmkH|a}BSu)ax8b;Ba=jw<UhNEc8
zFo37h=cV70p10#CIuiC3HZ=C&+Bfit^p^BB>C>pyk)0{C#_r7^q~3`INS5VQ>1F9I
z=-Z02(g`<6x^>0RNAQziY=*9FEQQM1Au$@}Vo(R#asupv9Y4n(U7>eVe9-X#Jc|@A
z;MT^ri_oNkXn06N%!%N?0A}EX?u~VxwRXZlHn=!JpEIMN1oG6d+7-CQe&tW3&q!a!
z??lDo6f$43$H)B14>zz3eU#x}!DR^YQQ!qV5<mk&xFK%1M+JFjDR66btbGKPTaH(;
z&Kpp`0G*?Jdp$~DSV*r*uTnve#J4dBWMPv<7Vbk&q&K9`!eEGA@nC=#{O{bj+F2~#
zo6={b2f$99bcZo?F*~GYQ5U15L^3d!hv<1InRntAD8<~rB~{rzX83u{ed1d-fEum!
z_9`w8!NxePE!~lxij4EegvWbLx=(>Qw%63!IhXw8$cL(j{(o6|g>yeLBHaR=3*<5q
z(#-GtItWvYftvyfcLXCb-Du>~!Vf<OXUVAwifcBubK64)kXem;j<u{1L5UR@u^%Pg
z@B+<E>1Xt1=2lO=_VwGFfivlKj$$bI-?(o_7P)0E?bFysNmpKyUSSW0LrYS=0v?;d
zV%hKB%+8)|u#(Bv?bu1tRh!$it`Q%Q;Vf&?>xId>vxH*b%I*f}!W6x*yRC3qIUC?C
zc2BSzMU1_*E}Rl<ZUUofoW_@Nmed&|-D=MCT^faAOE$K%S5_gNNuQQpv-2!tiR(V?
zW{|AjxURd>3k1F`q0Eege`8}}x`FkLPr;g}=yb_TRlxJLk&_#jLdQC1WD0XasW3WQ
zBwupUTTgg9SW&rgjtVyO1R!K;JoKRfI_>L!%_<hu0aH|T7nd*@pde@cS;#A1<Swbe
zLyUgy@1GOVehTC+IoH_t(NZd#EjD8Pg9l!rwll^;)^=dyj%sw}C_+@-chm|Bh&1VO
zW|*E9Q?~~)gBH6o@>|%JEn6eN(vc#3hFLGH`W9G5?Ee5yYOh0Q5Qt;&L}mLo8iEoe
zo+wzoCX^YcfuCBYHL+jH`%@aWqoWvt`(IP5oAuL&7Ib-n8}6|XBU$lg?iFeqIgME3
zUL)8N%uf8?+>o;j<2N#eX%7Kaxcu5$l?vR)%pS7E$n<Pz7=(gv-QjPK*ivhL*mD3A
zT7WWQ7m-0+qu6Wi5p?_ECt&23ub=V2k$fLBieTzNKs@%8>!&t_cxutmgv~JV2z+O;
zYk2zFYJY)&M)`820-14#1si<_WAl{tjN*T5`mk5yW7FH6HRI-FK(eud`<Kt*{7miH
zHcXB=mhPbXyFu;O@1IdLxDOA!B>V_Yas>)Evq6V@Rh!VN6tpW500bP<a@KT@Lb0V+
zb<H%7Q?s>|l{oJVCds+}&!GVaXJ(CgpCPjapj6L*%1-f0nBA4l<q;G5TIxaTLlp5;
zx=X$s>Xox2Gm@CA(P^x*Cz)X)7G|+C>tl$FX`jIqw7)DM4iq*$KDUz6ZU8`s{+u!f
zZDkv=GjrE~JdJ0N@xmuqn+BA=uvwSasCKWI797|aY3hWM(i#&Kpw@?Q!S?!p#5ph-
z-#tU%QO&KGtomf^#Nquap4ep6sL4j}!iMjfy8OK^9^NZM_!TkC5<@ynA39w8#60G;
z5n5UhwRT|kdSG-7))%c3b1R50F=Ho8JuPDchDY{3p#+{pJ43M9^FcdBTJ#qgQ#fVc
zWPN9w?Eb{VSO0XNI9nhLu~(<4$HXv#-Yl91?U5nIu`wDb1SdA<Esu-CP_O#&%2j{<
zM{Ob2ID}Q$LvDzGH;j=AAJPpv#X|nwLHHO=90hkC;lG4JrWLw^HghRGXY=%0Tip@i
z%VvN@D4%{_^ba;jPNEuE|0ypLS=*GDI%rn1vh4mTBBQGfu`f`;YwnqoI2E@k&$aoN
z$Yu&VnR#j<(Gbz7>`2QfdYy3FiW4%{*rwK5WVL1Sg=IaF1TWD4XsjK68VUesP!Ym!
z7GymUn=O3{#^Rlkb#7RN(6g+sc|8Z3Q{TEzva+#e?qjg5Ep~Uj6~<$TXeO@=`I60H
zk)O%b8Gd_@@x8$+o{)-Xa9U1$5LkwPm;zg^S@YbCVrQZm$pA8v<%(;9HYvp3@oPF9
zr=1z~Hsn0D#4BSC#B^{TW?a0n8<uSKW5kxeVo6B8zkkn$Cw|j?{7o=oY8-E%hiBGZ
zu$OGGoVEZV5fDu%B5qBY_FFGVwT(4zbZn29ak92^D~6wz)f2_@V5#1<m2B<ooqZi8
zid`2LDOGHPHI8v+1cAK{MB2Z`)6UuAIh^P~vdhEO7g~}7FLVSeHK}dIR;!pS(E}rM
z>-Jee9-8zgB1MR`^YF>RgA&tI+fd!k4#4b4;>JI@;eRtb_#=ZWUNFmNHZWryDtK*L
z6oJnUA2u@??K#95*v@@I(E{dDZpfLSofU^ZptCWH4aaB9vE(@bW!pzLI;T<IWoqn(
z8qOKyr7drsqa8y78sSFxt<1c70VG`zUk*{kj)Z?pj2=;UZF!7v<3l=BV?*DT{cN~2
z*iS}A95zCRO%9;ECKZi@pvB;h7{J6GxKFdk#Mr};EXHGB4$3%1zxlH>yS?bg;hA*_
z?eAQ1*h}(^cgY|FmpKQL&u-hlnF&(n)k9AT>w~BVB$SpW%Ll}29WEue_V(cW$W^E&
zsXPXw*kS8Wz$Y@Z0;ONPcK}<?GwX;^GIrt`)vt|_(AxkrTByVZS^<6(*zjnIYCMJM
zIOg*sj;hYhO0@=Uw1lM>qbZoI2_VOVCJ#8}Ec4cOMbCy8cHZflbZUxj)G%uc4n&Ub
zs2hiPk4Pt2_lfhg#gZ4M20aG6>%a~=#tt*qSY^EFqqnWw@XChF?nr_ff`zm$Rez4=
z9*7FI$c{%n#ZaB1PZ{2O-22LubJd65SGe@2M7bgIVFuG6>R@g~%F_}SHJowP%cF8T
zKr&~GZLn1hlFtvz&u#`5Hpeux@UtYRd2CGw`wSo>0-mC6LFvnQgs|8hO<imE9t$w^
zxwUlGyjMwxSMrV{Nlb)c3-)4eb68vBvK1t!y*M_<EM~0Vl3ld3;{iHs)82-(JnhC>
zP6Cu!<Gt=klFo0dft5BUAsj_yW=;^hq&Z+O|0x}w(J2y@gkXea+{R<=o9kPw<Qd6U
z1B4o}QHvGX-Zn}i{b-IjYh<xvPFrSiPwA}`JBCp{FP#v#9ns3OqzP$a#O};%kd0vn
ztp#nZsr?|8>pK7;w@~iLWfHV;WP^<wD=P1^@Z73-4Q3?h-=%Gu<}d<Ho9q05F;$Sl
zRXkK^$Z!FI1yp11qI_LbR-J%^Z9ItVY@+jtO>#@%!3pfUr1V#VDME>DaG$KQV#m!8
zZ_cY`kS#_WSUf`TM<RARvhnU4E41}|NQd3eq!U!MHDauzOO-kBy-j*VfwW`UPHYy>
z9Mzc~V!&YrHgFJ6sN1ZluLq>O<Ck-{A%OyAM%?a9a*-lzP^X>^Y%R1cP@xi~*jg%c
zF^Zul82CBLEu~>eH9KxCeIQMw=P^&Y%fBP%4tEhYB&<@PgB#Ca3L@Qe@W&m>x3Jym
zdv>ITkt!~Cp*YGm@-d$H5p4LO{$!SfBm)*qgGdFVhSTdrbr-#%wdABvwg}`kwXj_s
zqM(uNJ7=gc2arRen6eA555GlLAP%%LGMKDCD0p8%t|)Yc@Rmfv0Xr{%GBW$qd;Tp6
zX2w=4tQ%kpczuK?l|Ov=gbcM=vRk|!6xL-Q#)Mltf``!ItG0~QxNnuTVqiRbWX7=g
z8Og&3(qnt>0+#r&m#G0HkufwTK#w(net6J$@|y*iSqeABfgD>{7UlS3C$S$Ikf3mQ
z;8bAAgpn8~axFZiYhEm8qHAjbRp1gs>zncLwQ&^#E8bd;ngJn=M-Ux~GG~`<i2YjQ
zrR)Yv5YD0HQMZie3K%%+YgiA$iqL9ig~Qg+qA6KIGl(Euge%T8&5=dUimWXZX)Ou4
z`M)UudQ^G?3k%$SOY==RIM7*G)&Qp?MA-p=PVJblaRWMq{4~YZY?0QQY(_TGx<|xl
z?5xZj{Bvi7jrcP?oFx^xTu_}FR5+|+%YW9Rb@+?|%pDs*9|?a6iWMB-DL38+!5L%+
z+1RPZ@{kFdw<6Hrt8Gw~Dmzvtw|KV!<9HIyqk~UW=GK%OPkHzWe`(+gwgVBgmRDsq
zi)&)>WFRs*uhxT*7EHX0GFoO;wK>1}GDzb{8L7jBMm81sLS@U*7(ZdZc=x53Ns~^^
z(1#)Qkfdiqa#&DzEl@flkB;ak{LU>7Ao@4Lr>~h8aK{j#AhI;R1`YOIHiNt|-3Wwv
z3!Jp)@?`MC&e(%1+>IA|8*KPU{wuS=4S8me*cUi}fH3AhP%nb8rD0AB1E4qN*)vve
zufsKFtwsk=*=vQB@(MJe7=8H2Gs94K1OeAL;H^n?FZ?h-H9`u+)$4VgdE(6=f4)y+
zQ-QAv2xY;;N9klU+RU7R!iWpPb<g_DjrjfH`o|oHmY$)4z&X(zKCrUrK7Da)ZSbXQ
z(Ay)BljhKRh7G&6DnZV6jfkP61*?#*rN^l0%?qdQm@5*hYXn~5RU^cnxdV$7K4^ih
zb^C1W=9M|{VFsL0Wpnw9Uuy9uXBM7hZvCBTIfJYb+TIdx1~;}F1VG+$vBiLjl;b&c
z;iNwxYbgI6mCAFv4FfjI44^&sCV<$#9lbwCAiu47>4||tg};1DYIMo0fhfJio)6(q
zA=;1QZ<TGkme$C}QYJmeE{MRY7sC}0?kcQzX2;OR0~cslj|QI^Yi46-OKu?o6?9M>
zAM&SqWICa%Q4?BFwOi9U5&@N54f8(Z?UA&W?qHa%SiPBa4~{|4j86esme}g6K7mkX
zfVQK1I6#I-zFTB9<pwK~s-Yz)*aP=$j^n8XXG>Th9VVfNA6>&f^v7FUs^40CI2rW;
z1sHxTK|%R?Ly7&AKE>Kxs)V_YrwTJ(nBVPCl4x?>*D&rg>s?5wGTb*RXh$|Harnte
zIPc(J7u>nTFvZA{&XuPls7Wdsb<n(@z9hT(5Kyw`gE~`zU|nYnOseTToEXf|`?O}$
z*W5Eeo&slljQpQS=MjW;1PT!{mbA<S`DI6Vabi<K_lp;>S4R32p|mIkdiplh|8OIr
z3l-t}F@op}i0lRvk%C<iB(I!x-dHt#Gb+DGssj9t!ubUlc*{3V4#RiZJ)=p{5$4X$
z7RbTtT8DsgUUCgU;6Q573WBb&-~3Vid`fpuM~SbrlSCt!YSFIla0wit-4PXRV6Vm=
z&j+ngas3YOpQBU(`>c31S!Yl0;tqwYP^1a|I9S!stO2x0c4qUB@7vxCvPd^50QdqX
zO5`s{Th>^ZMDR$y`^r)YiTeK?W|c)PyQK@N<@zP1{SgBcEW;xfw}CiaSiTV))NFv^
z=F+M3yph2s?248<R|G`82y_dw`65F!GN|C!!0-*IW_}1D#`EvknbAPFwSyxv)^=&>
z;Qp-~C~u5QkeLCP*%A&T$im!uW1wD1Wiq&pi^Ii_BqTDZpsp?PM;(QQ&4A4741{2R
zIukGNqv}gvyv!_4BmrAXmO7)P&N(q?!~qKvmC}z1e_vbpRar0A)E#g*pqd8EV188?
z-L?<r?(w4?kx6UmM`?3N=4`Fdg8WeaTbmtPuzndnq`@&Hv8@NC@C-Rcr7&WJ0D@X8
z_wMySGp!KLKzOM+*6$-I^5cCBnJwN7lA)LXYlb~d`b^Bd{JrQSFtY)#E$X<jsrIpZ
zXBD5;D2yG2b6GqyQUePz>>V7C*b5!15hh?HgRX6;cEms%i~28VExX5b`F#cy?xMty
z1Ylj7>)ISP5|;kAnf><;af|!TQIF=Zmq?b22Mw;p=pR`S5#-t)09g?=ZYVJnCbS(9
z)W-1i-U6hJ`h;QJVbU8W8%E$WvA>i91)4$mj%Ye4JQcpNla7q6v0tcEUA!+9VAWTg
zhFTHRBsVnT{^N@xMk#5h+5$i>v5*TMb5BH?V3u26J6u9$ULj!6Vfpw9LsuBlvt#7P
z8TF-rrba#uIco`u5Ha=Kn9%{o%N01#zMatz3LBr18KmyIc=Mc1|4%^orSv>YomB8_
z)PXsMerBW!p|Bl^?-mv4l_8?inOWywNh^Me=1)ll!Up!j*Nnt3H=#K7RqXKUwg70|
zPp}#Qzs_`_wODiiNhap8^s)hgYxaqN5Q0*h4<4bmqDMm<nnLj_X1VX!6wT=KhnLS_
zU{sJ^gW;MJKx{k)2(myl7?cOrrh}HSG+=(e_$40inimfAJ#g8G1jcEsQwV#}Ker$W
zKIh4EcnG_J50A|ghBW8R3*$CVDEIanAu|Gi#*o{R46+~?T(cHE<5R|WL}(DL0wPk2
zy&5VNy@?u#AwBwiW6eg%w&Y7kNU#lbn##1-eLzlqrn)VKY<c^0Nm%J)jsS}aUf>_M
z5{VXg!VZdUX+cCWP=Jj8PSE8UGK)0H>i|(fuD+1npaso86HG+%(F>k1Ihu29KudXH
zNM$~;cIBwIc0BxquWQiuG9u88osibRxyV_Oc<rEsu0|bL-VZivhG@h+a(38-bbfdy
zN5ivq(0vQ2RdcEm*W}WB6fFX`z}Xu9vN7m?3l`WLfirmi85E`<TbD$(Ev8pC^0#L|
zCKi4+=*b<L$|kHx@O<|d7rakt73B0O?0+MPtw)tt2F8o0jZ{7gAT=RH4~UG)(hfZf
z7FWh2gjahq$YawSj>j~TO_1d=w2D;T3ut)Ua)i5_5(<0!$I5WwXu0db7d27hif_z}
zF<J2zo%Cq%<_`K_v3V=t*vR_ZXU9N6qFM1c$HwLe7T7gHop}Zv$zp}0K0+7f_<(RM
z`5*}l2z5zzTw4Gh#y1rft_F|2?*rJu4vvCaXD_|8?v86uq2l_?Y|_Fy*gMM^YYRep
zTQ9feimKlnuQ}_3(IXomocNUAo*AESN4(|&)ZrrpZ)}SvYpszb>}ju>nU?Z7O8B|;
zPfd&)+8Fp$jz$o%%2E5hppW1}dO~)Y-IhVbE96GR%0M*>G$eriUIr7TshUG1YUW_f
zSlx&t#vNEd%njy#0;^QeF?NKNy=~MH7hPg{Dnd!mzmrBh$HdW|{346rpdQ!k`^Kut
zQ~qgln*r@ep=)c4$jnZNeZKWo=&V^J;LVvO{3QTq)R%krZ^#!X_~0OscW~h*=GaE$
zU(&-I{lr^FA+(gDr({+UXYJp-G_zXspF8$dYTsB}lkvzWJ=i_zu9=O3K}sK5gWZS^
zC~yGB?2`*Ar+RY-?>T`5cI3OLkdCCgqzny=(MT@i*9v6WLjf}+l)*!ZP0~silrd+v
z=Ho@|ibl3;#*rnpUC|$N(l?F0`!CR+fPflnZ=dsA#UKEwnHlyJ2}j-YlpPry{->;Q
z7HYHC!3O)hW*cmv$j0uEII?GhA}%gTWM|xQih4&ut2qqfbE8qNdE8i%3{Bo(%<|D7
zA}4B$$H}D!SjEm>cpQHTddoRN>mb1&BmF34?IY06I@q_ExO9h8;}S)pGaiKICci1>
zx-^e-m%(1$q4qUMXhrNIv_HcbN1M)oj<bj8y>_}mpQCEm^e0P<2hmY-p5Zzor_3>|
z0cZ4r=!uPkTG%Y8aK@E%kNL3OfQIN=nHhLAv+JX>J?;Qn(u5@daN9QnnQ^p+2C%bN
z2n>AFZ-LCvg&<iDpFcW}TJC>kUESs3wj2Ef44+ess>$!8hB4%V1TkgHg4NdY5hy~<
z8}$O13J&5FEOKTw=?vF<LOo!77E|omw3pV>``En`IAS~U!wX-+h@NJ)HzQ5<s&#9@
z%#Hp<U33Lm;KWkdhVttLzIipkFJlR};AFvlIA^z4XyJxY6b}y}@Tkuf*5%)^XQMC{
zP)s`zUWvguLUU#|R4FjKVfAxJ0~O!gaqvk53pon3JLIv)*3amfs91)elx=Y`>ro;>
zh#`dQ6>Ud?8;yOO_7XWoIH2G|d0l^qKluBvtP!m=etd&J*xT?1sRUOpnV38FI&&P$
zIcqT70J$anCFw&+F21ozBhvwl8!2E)FORLUwlfB@Y`V_Mi-5Up=mO|}2Y7I5npBT2
zUKpL|y@>A50NkP<H}6ks>22T%u;CQBURmq;qCcd+_wp;E^XS)@S)nR$TMa7s{t4sI
z;yPyiNHNfzNOya2Y=KHd9k)gE=jQjHU_*M|FNJ@1jQyGZd222H6Rh%NGyv)>6wC?Y
zTf+aGlx2-?8q$a({c~H<^;h?Zz-V8FaO+#4VXZ+u6`|jPb)Ahm&<JaC+{MbiH+Ve>
zKm2%brg9E>=hA@W5(qcSD_d?eb^N8J0CG&zeVCvf#NfFNuNsejYHLxS*=a6QD|)O5
z16sSs&GTnOJQXp>DPd5Qf@=%Ix}c!7Y>zXNzzAcaZdRFH@0rkj4IuKo4BYeFXh^&M
zq@N)ZDmQp$!i_cnAsOlLicGg>!-K@0kpiu7yp<WSDM?*E8bL(oRd5!<>qljI!jta?
zKuV~1yuyi<pBwCHW+cPj&D|grz?H`=z``0BqeXkT0a_aXCkQho(D53(-5BP&pC*tq
zQ}q;_V}su)5#GWUxVLz%wP+V;#gt_03?jqsP;3JO+E{{+vwJ$Lx-NLx>0mT%c>F6M
zj3B#5aKABA(d(vYY;Z_Kr{Pa0mQ-I$=cA2`QFm(5*_kn7cF>_pSj{~qYOqygQDbgU
zNoQrr^00^)`aB^)&VXvp>|CzOIK(3cs72?qelu)gVN{fq);QyRC2PjrV*PW5{LhUE
zD-8aS0yVK3K-6P8L{RYa?l$kWgpGd}^*BZmYU4F+Y*<#|n|EPHl?5qgXY(lI`OCw(
z$Wc}(%&ph>>coDo(Sq?JeB=>pROJo@)CK`h3{LL`QmB~{aBLQB+NWW+gV;axe`^LW
z$HfL6Xky^eW>CZ1JAi@}zoHa#&)5zD+Lp9QWsXHey>y9)qFbpjsn)F9IEoR4{Gk6=
z7UHb|cov5X&>^4P9yN+}oz=l|m|VTD2!9_nkcuT)QX-8#cx{PbZvmH>j(ddsPnb5I
z6!0E1-$y>q;wUWG6WD0tPb>&`dX~7Lz*(Aq2~D1$eJy^rhk@3H>}PyY#>R-4u3=Z^
zhc%cEw;E0-ech$Ggr=Y1?jz_hpamz$7I@jpCV)&3(*!nsCUf24x+-&N>QnE|rT49%
zKB7Y6;;4b8zQhY-aWr;GPtH+Exx}PYWG2ztm=PK74QLb^qWgHpQyhT>o`b878_;-=
zs9R0VL`L^-+6Ws`)cq&7Hu@-3Rph}zOAqo}kAhQUlaJBuWThYClBecRHn$0X12tf2
zuXjaeGDQkoSSEJ{HGl;_m8n`RQfq#;a5I`w?s#5$fmkIcN|GTP8_EzPAcQBeIT8s(
zGB*^j0M{%BJ~U&IX4vq`j9QC5U;7Z7SiGE770*9st8Ju9M(pJM*!br4_Fr=ZizNqw
z*s%vAM9qxRvVVr;Y0t4$xy59K9S<1%)a@V$>&y()1Rf{+WQ$?UFb~5Bv?DYN#5dYu
zB9v>e?ISd{mO&<Ag95?`;<2!Vo53vU4FuG=nV|*2eq#w+VfoSU`IFsvB3*EkaJ>`8
z&|h1>P7le8#b{_oox{ct{zie`&fS1U+FhBsmr*cU1X+<pbfc~OVF{O%qE4iHP>^ca
zMO*4)J6FS<&0<;*CUt=MM^?DnSy;3;pcP9M>J@CENNaW&v*XcL{;0u-$2-PiZViIW
zZJNWd88zrgA;OL@C>ijxLmFR(4NAb+C4qmm!YnYIYi#TGFoTg{0fiMlgKgqv_0F>7
zeGOm(e<o<SK^`gLha|hO@*uLKx_!-;?QE)KXM+`szMmX^LIsO{_kBZ~Rt7W_iDs{e
z8g%PzACZn&&oRDX1-}8Uib_moAyG2D=hBt*Sb91-%d5rObvD7JF*@bW&RIQiCb<&~
zLW^VP)g+EJG|s0!Oi;X1q&!Okxb)gr5F+kcd2lcPo&!k1>V04|_c{$bAw;@|(S*@G
zW}HR_jGFp4F8jFUj{EH~^o@~Am+ZI{6HM)013e$n{+0Bh^p13j>x$ZAM~J(yq<jzH
zrjp4orFSt}!JJ9E=Zc%}a8G*{RLdaQ52crE?tFoX@EvzeyZQ(Ok*Y>dr41~K;3aJc
zf;uw;y;5v6D@?GgrS)b<?JzfEurGp@HUTlWz@y-~Y71GT3^RQG4b09UtIwE79jwM0
zyR&o{y2JZkvEy<Z5Y^z%Wo~RUFhEb}-VEkUMz9w6K{6W8bLQzt5FJ2X<R~3dI6`JK
z9%MF%KqKCsqQz0EtXQKUYLA3IBFd>Klh(kIG1H{M2J8k;p1|>Uq(_|LAUonW?8z38
zI+)NmkKii2Ff3!j9GD12cY^{_%P57Xr2h#|-_We`l;J2zE{~oOL02qWZ6zH^m(qvQ
z9qBbj9X>R??nZwuEPZztP0Kp1_{Lra)mbP1lHAX^^E+gDh31X0u#yS+K>A2}!H9?x
zVwg$4h49Eyj2jd3zVuLf4FYJ}KQ$;4ugc>1{o=hZJ(gaz0%FFRMI-IT$Ps(}>s4cN
z^FUFo>nwH@%Trs9b@P04`gy|%Pcawg9C&!bRQ<`o{atci=Y%jn2Ts_bi4z0kM<^T@
z(hsG-;dJSarK>>?JBw*|9s#5P?I+Stq)$U2-QebH;;1!a1$MqRopdF=E&WLP6!Vx0
zOlqicX&Y#~h%J((95A>wTp%a*4&QR-W~`-Nn8TB4Y~QZ?&`wW^L0*|)qT=z^e9?{v
z30~Kb#+u~+bLp4TOLk~uWQH@GX(8Q(61a08`i}H7>1Bi`YYW(!KJTO_(xWH>CY<SK
z(hsB;rRP!U3eDR=?8~Tx6o--z)89gBiDFJk)E23nZ_gVQYWlEcYc<KaW%M-#gv&!B
zQc80t{R7{_jV);~(`XGWsql}a?=Tbelweub9vq;?G2gYfUhhipO0OAdP?&%X72pXB
z!yb?RTKa|bBk9$C=jcqjG!E*7@47mi$Q>2zHECLIVfhrJvf&YS0~@$I2&=qew<Aud
zu#iItxG|8qWRr`{7yVbsE1VpvP_hC)m3}0>B0WIP0d1;Z$?Cb$#JYG}`myvWSd1Gc
zt4>Lx3v$=T9{Q!7_Y>(y(#z7zRtXW3E3poRY5L8l^mpj`9RLwfEsPDm<+(fegPHNE
z8fytHX#)+nnp>e}f9KePr8S`k=`nt?vP5CH04wQj>F+sH;tspw3UYA=&-9A15?c=<
z-<7^4eM)-K2#i}hMJHwcD{H#nU;K{r_p|~m4AKj@Jh`Ez^aNKoV*O8~m!t;<05%qN
z?jU(4bm@*08G?e);&)b745>}sYs;g0H}}~GBKA2l;<ngylR?}dig-Vk{*!bf{WgXn
zhxrh+@P-WJI%Wc-i+>{h2Pu<&M|y?v86`^EviOgsA4%`h6@9bvKb8KIbXWR)taoEA
zBS8x5q;u(^k8unAEwNrx*rQLS3!GVQ>D!F60ax(|p(hy+Jfj6M$Rl$?@yme$i2!Xv
z)AfXU*p7-*yu}={P&SbNM*4;HpQS?j3LsU60Tdz4ns{PQhe{&+h4fd_M0!hl1<bAB
zAgVjGChn~5eSh)W(qBt2Q7@ev7jKR_H;C<FEZ*Q18mmubtV>1cvIP#>SvKBtBe}Wp
z8pFPW&KC$k*}upBN)c$T*)s*`d9+tAMmgNc3j9*~Yv~2)_b5hO+WretPRf|}F-*Gn
z=hA<bo|8_b=PY;WK;8qO+&Bc3<o}NJchZZ}UF&3zdUepQuBC_Qf4>OaVP9Hex1jn6
zFud-^q&*v0V=*#}NsFT82rvOb=hFEgiwb|gA#jeT&#*W<V!CcL0!nNBRQe$+_BqSe
zb9Ucax|BYW-jUt~E#F`KJ?XBrkiLKqs93!<`}|V+7#dm|7eA4{!!*;^;c~3{;dk0(
zKb3w#WHfyKmIlfoy@g8^-x74Ff`7gFR5<F~vdysVg@GrYTJ}~xD3-FhUpd|<%<wRL
z6VBm7>Aw=bykd!Xav;~3#GUkx^r5R^EqyHgEkfQ(FLA>ZSj5V`N3M1eB1{+m4Gicv
zFnzVL%ThM^vGg73r+#sKq5~tM!^ndrt)!2oCsHllBTJol9M%~3Jva?h!(AgSyc~2f
zWp=BnRm=i^E71szlF&nc9!Y;^_-g|ev9;ct6=2a&|38-g-uNC3%eJIEv*a|S??^wC
zp1S@&lK!3+uGh%`CPbYz<=YRWpK|b4PlYlI{k9m|IR?DMo~~>-f(8`6*b-5z%&dYO
zJ#qm7_X5zIb9BOWH5TsVz?SVZ8X;@xiS&<5gZLs1%(0QTX!OU@ccmW>RQQqfElAg&
zCm5?RwJR{POX;W5J4Aao%J)?IE^RVP>C8@!i@NJddQbX}^fSQLz5@uGo?!siq!ZB#
z6xHR2jEUKA;EkD!)=J>vwL1DyFU&mMEMUjv7|}fCQCq<m6&BnLKj{@dl)i<OPkH+}
zj=PgCqz|Q^OFwZLii`hBx|Tj8-NhJ3QlB_ZkEI_<KX)CDJO8z`m0pJcwm>u$G|-<(
z|0w;$7r><uI7eWhnKdNL$W9xpW8M7HgwjVeD^y6k#`|pz3Ep#AV&$WWeqy4!06O2|
zn5sTZ)RTY8qyCF@jTuNvk)Xkv(MW6Q`_j+d0i>t+Ncu;qlb%X1*g(?g=Y2$-=bgdD
z>CXQ_`apVJdI1zM$_UBseIWfz`mt3k_YWVz_KvD~;^S4;Xt1DQ(!zPnV;cqTS`u|n
zP@CF-kPJV&w_aiQkF01Da7c8A&4^}~&mhu#ApNbhk=~bHher{l!$f2KmGnL7`>ur@
z*ZjS-lYR<Yts{d=GeMq8A4@-%esFm4yV8G_E~HOmiGmNBG;b+=D1Bf0*?`2KJ_QQ<
zND?cT>IoHrS!MuLkHcCLv0b1ub5gkz1~Np7xAOySghM$7=WVTZxFc3;z=fVt3>b-l
zk_i8b5a)UJe`i$fjr5N6j{^}V`}4l^U!<k<IS%GMqldgfich3}7~MP{NPi`*q|ZpN
zGPtDy`I5Ah-k1JC`o%~FAge}oU@?U$ZNxW)ol!M}fwz?0B5K$$&f=bRXUrT%4ELRr
zFa>M3vve*92Jx^XpJWMsC0$ED0`TduTT#<oGN*fYc=0>ZQ|TY+{Fqwo9DS1KG+hsO
z-@DR<^!I3Q5Pl-2xR4$KOb@MpT(HG}v}VjpvkIBnh8q|^w~Qb><_<1Q@M)vU+grRy
zGq!|06%fp|^ptA$gg|LN3Zn~i@k{AK`UgwdrZ}bmR-Z^uMn(9(w32?pO(GGNwK-jJ
zz+3O)rS!J+Q2Gb3(kNF1ynM}(qPxR~Z>-W+SmF`wBLxi|83oza%P0%uP&Nd67aYl1
zm>H=@=ST!+YzMC<!>e2{hQ0!fu11r9q{8n>SJID-)*sMRffhcJo(#gU0D_*v=PV7s
z2>#oWGDc5?tzG<&b`G!l#kgxIEF%zT%Oag}Za@y1bOt~f*=#<5a57@-kEM;($4<Zj
zf+JKLG`Qb+bF9V3AfLJAE*$~nhIbjK;fDTSQj7_xEi(>!lK=Uk{)Y(vfmp*i*gN9E
zN0icsEB{O6e`ciVn%!E{Yw>iX5iH)RCUz0HPQkMUz*E|Ek{cpPbo|z!_j8I%;WTni
z(=CjfdGid3KlFfa3F=~Ld$Y4D&G-U@CT^sMMi|QM1*5#t9=egp@SYe7;6jelNArny
z&g>z*JTjiX$aH#ZF@D2f{pcT&p`UvSR>bu?Hu@GOqtp7stL}sNj9~Yecrp3~a$=&~
zPFf31zE%E-B=wOYRGHmhzmW*z`hom!UA#oebqqPEhraT|Pe@^IfsQc7Cm7=r$z3q$
zv<D<4yB&aAhj!1Pw}uw%2I2E1HIWst%Ze4L`T0|L6bs+BgHQUpM^Kuxu~@0SEs2Z8
z`}knZ4mZ!U7H<$7#<F+MNZ(t_lum3YR>ta-Y?EwM|6+wYBp2-G@L{<z_Is8ultfsy
zEiv$sv||?*G=xlUGeNHZ`GNd16Tv6wf8IACo&8R8oBpp@d|&ylzkh}+Uk<)wLxvgb
zxoZG|!W?DmG|&v{TFLDa_z{h*8}f+GI#TvoRAW;I)`W(q_KKN7s*BsgA6KSv{r)vS
z?QJId6F=QQ=eB{kmo~dF6=b)ZMiVhbHlQJfsCFz~y7Eq+xSwIeUAOjqqfusxUd1^@
zys1?H*P{{izl{9<xr_Htzdk^=Yg9c*f5B)I)~9tkS4ZpjC0z?=_=@|OwA3KA0)N=k
zh`oTeV~SE~N1rB8;sl;`U=h#y1YzpGul)a67q^&)Fs{*=ky{2|9TM?`ahpodL0X=m
z<as{;Y{|)sU{D0zvw-STv46z$yNY7Y)#y0!|1m@u7ykw;zi&q=?dZBKc|bw!t|qLB
zbfPy6A_(RJf{;O{m7NuGYU5D$8(`bdGXr*7S}i_Edik>*!2c6J9U=_~#|iO?I(Ls*
zEZe?H-(25eu^Q?I6Ejy^3|nbz;+rdXc=^=t5U@>jp;hLwo5N1Te*>S+O2!?IGO?IW
zDF!7tWo=>19tf+!<^rdQOgTdS4I5^4*bkeUQSaI?{)G)*X&J~-^QZeaCVyFKnX~gN
z<A8^#JIaXHu7$bb5<=kVnhnz+g4iC6s{N+{g3Bz>-QayA=q-#e{cq&6cK{R8%gTmV
zWQ>3nZY6@_kz8VK5M65?xV7Vaw$xF2Fko(azq5MhQz@g1q@W$<;;<9%-^QmyY%NUj
z94^Hg0AG-~o(`zwEyVW=AnLmq?Nok!2Rg;DnKaybL!)6PJ%`1~`*VGI;nTmFPluss
zjCH<}7SaRsx*|%y#6#{gu%P+~6I1b}Y0YX2i~>m7*8pTWe`_v!Yvb`MFzA1SnlJP>
zm@iM+_Ah|sH^!;&L7)OZ3aZl*2Fpx3m)?U}qQf5g_>V1CCF@UfSkM-<u@L*;&Jy&H
zu-uIHlq&u)6}%<Y^}Ju@k$~%BPqaAB`&3=8?W=nje%v;Yt*Dgk0H8N~<GJ*Bv=j2*
zz$ZGE*Tw)(c>H6jkshGum(nAw)n5LBOZC3gaBkf607<OrsM$N<pza=7+lcJwfDCjZ
z{p5J`T<m9j98CTn_A|h@r_95j5CF+0i-Z}tDFUZ$lo~uv(AiWey<?rgCFw&?92wG1
zMk|Q>mHA`giR;_O>HO<{vVZmUi!#i@&UBe_qFaE{G4x~NC{DB{?5y7=b;QLyw3goN
zUi#XZNPQGa%?d{)V1^r5{&#i5|E$gP$rtxnKpRMF6C2<j#p_^!6mV)Mu>c(&d~GOm
zx^Fn%Yi7jUgQa?N+lC`Amo$3LjD^&(V%O5gz!aa5f(=n_`*=a>U+rS`Mc`MCuSbS)
zX+a*2={}7PMG>D=gK^X{z~KaEd_qUmj-2XMYsXr60VnXgfPG?uKxT6+Wd}@^&vpRW
z!T<gSB7DXY{JIzCw_JJa#ZktZq4^1hU)q?WkbEtD1ZTSUt(g&?*5LUILq}V7ROR?l
z3~Pvv;}kJXsYcJ?;a@PAu=zw0jLQELr0|K07gv7r#lP9&Ig@!}8d@W;#yS(a0fZBz
zf(8?|B!f$k{vAq+=MWV459mRCZl|sUR8(*z$P}-z{X_>mV1WEfitwMfb8fx9mjN_3
zjW4Q!4Jc04uf<OWW*~s(3I~$frb@~SHO~+mpl5)hX|}WVD2a&|Oe|O&&T<`mg7H=M
zp~Fy>)_9iV7ss@mj)%(m&*mAYI%XE2L>H9PYn$8doKb{4WfZ`|7{Js1;TBW$G53qp
zGvAZ$qo}!s^0{;uMnpB31{y6mHKr>^_(A0^j2vA*y9VI$j|e2Uzt{aX!x0fyHj~l+
zDfn=oAb(fP=nZXc!1t{yKgI60sA`P?D6mt<n5VrBYB4km!t|25Rm1&r%Ej9OizD#Y
zGtLy68RzRhwg1P`!(S%@&aDg)BV<AZp@sFc^}jGn;&EjC3?Rxkk4Hb*uetL|8>ts1
z!4?Iru@S+Gsv`rdjJcTX`(3P1=y+vDY2S#N^;?c)lathsyMl|fdv*p8wd=C)2IB9}
z(*J?{UHGB@^=<l}_l=JW|5o|$9YBLX1A?jed9WZ-BSg+{z$O-;=uasM)r)O>gl^Y|
z5oAW}dTgxO;NywNbPf!9@3vuH>4vA)6Fsv~ytR4!kEM4H`*CyWh`F$5taZ#ZUGV1b
z(?&VGcp*I}y~I4dz_o(5aV>oyy?=Od#37+Kx%seTI?^^onT0VO_E~Ijg91-=JUQCg
zE<JV)&1~2fJG8IDE7YXM=_i|KIDnLUydXV*`WtnF2^;HDdRKaH(1;fzd_j7_Mww>z
zB3qc@?@H&lE#8X;yU3a_uA#qwJlcsJ#GVG@vP3T@X6{4M$1eXMsZEXT)=5v9$q_yz
zB-aa_S|fjTh-%C1(;3z1)F?4dgAM&Zk)B8Y=Z5P>ufetSk@VrH{Ews;r5B9~-?583
zdQOLJ+;Q=-^gw!^E~ds1_nnb+-yiM77XC5CpeNS%USVTu3jlgaP}Gh~rq{*KOY0L(
zeX_rz739WvO({3j7X8-NJ`veKYlCubmH)Byy!1diw%lc6SzoHhy?5A-7}H?xlk7xu
zP|JfPlI}aB`hP0D!s0~$n#0CvX>@qc^}p8%ys}^~$V3qerkd<yX=(>zC=Ae=>F)~^
zI6QD>2NKkSDtho@qsgPRQcz3cP!idl4f^O075H<~H>B4&A~cfv4MQV$(hsEnApOMH
zUN;wiTKWU&E7A+VF*S<yK$=J&NdML5z4k7CL;9-pJJPFIua2F(l5*)>`j+$`rJo#b
zpxD+0mL+}ti9rJ^BWCvI>1}wi*0>DL<!S&{mz=v%`{G51OM#(Ergw$#cO0d%dj|fH
ztp8im*QC#2LrS*s5lQ`r(%(q`anKT)MEDu$kEGwihX&%lwN~99N&ma_!@<Q1>C@65
zNne(px2D_5OxN4e+tPnS{zKDq4ca!h9TT>tr8Vg(1(=}<hR0tUS9C)v){)jt2FJ3-
zBXm~vulakya-K-C(M03=e=5BteN}o>dRaQd02HWm;#q%N`oE;NT?%ROJ}v#e^cCqo
zq|4lR*R}Mn^mo!<Nxv9uo>S>f=}XdE(&wc+eemoG^4~9||0w-vAb;Vdc8r^gsSQ(f
z)H?{~F2W33S(?-8SdKO8mqRsQSh#=V0D^HF07XPOJAT*@kDcO@hd@yJx6eyol|C!I
zC>7Eb2*!*E;feHLr2j+u`9Orn(&weGNuOil+`=Y0wV;FFmHyw-_c?-cfAQy}Ka;*}
zN6R(5%VP+ZKal=U>Bq5o_71>t3fU=0m4z%N0m)=g7HT=psWUSk`BP!JbmaKGLVRZ=
z=*m#JDZV8#mF;cgXfh1=*QGy~o~K^<fV#z&k+rpSB>htQAabdca=b2mQ~C?(i_(H&
z7ZuN(nwfva=)Q-MOQpd3HR)^8pGm*X`9Wd&Iy=+iO!`>*(C26zs!>I-JEJ8xa>k7M
z<jl=L$p6yj`CUjW=^kMAh4I;XX1ui_w?Q4vZ5l{LYf@&R{D}2Sb6EeK^ykt8lD$(q
zEij`YeahhnHwDd1T1an7e=hw`(r0au?u^*~ng-xI(yyfV{Y12d^t$v7>3@+vV^3JJ
zb(52PQ~DXxg*T%~SW9*_1BhuEbKKf_S^0nk7mj8}u({z^S6I@@poRU)MfE4Q0H9_N
z`qb=M0nhPdGy~4C0sl_=L+Mk}b2Nc$`1W%Y>6EEpn^55+>9f)wOMf7}C7m#uE6~9t
z(Kn=f%p?0a7B9tJe=dDR`jqr(>C6V2G|cg6rB|f)rC*K8zlQTPL6(9Uu{3f@WuR!!
zd(<fD+$Lgg7$i3DGn|JtfRf5&Ya@?4E-0w#M1^!H|73%|ApNEE2hz*Z1F4f9)0vQF
zvfQWD_>uJ3?N8GGuS;K-K119W6KvKbo{e-W{akt{hABqq^k>q)m)@}cw+VwwuCSc<
zq=(W+vGay~0J+T-o*I}J9qgT1n8y8RLH@X~7=MQo3f!@@1>1iDn-KM`C?(HmpWcBY
zjYptU=~d}Zq(75B18H<(<Gp4i#kKS)=~d}SdXg@lNH0lWlfEf^LAp;sQKM@WZ*foh
ztXYy)QYAev{jT(NVxKw69d58Bta(j(1qB#p!BJmmYy?Ybsp+&&72PMIB`>_<B)uK8
z4;Bn^xwICt%+t5l{^Zs;utf{fg5Sjf&kbPNe=6xk>8sKoN}nTGJ|>qf%(0|!@nz{$
z`Y1ifRC-1Fy7VVRFkAL*)lY<5={dtFT3@_Bls+T95dXjh6*f4LjMJo`$1eZS{}_Le
zMlP(FXCDlR1q$Bd12#4RcW&TWq|ZloCP*{b0_p!Yc<;*0eM|?6?J&Qs6FABL>(V!+
zm&lzeBj2qJBY9K$l=P3%lThIor9YDXQhJ?OXG!N}M>kIn6!3lNThgQSE)(e$>5rv<
zPvUoE=R4;>fDz|C7-f;|7|DMRg-@R)oVQbe&~}u_b0wX_*o`nIpnG8eFR-M&0#1zy
z$>}``wF?GB4wrGf6kAEJNPi;z&(a&H(-!17p<rjg+;2)>kp3~Wa3Z}Z{l4@K=?&?Q
z^@MEgH0L=0>&w!yU;LW%r_x_aU*aU%HN>5%;b(K{1?eT}So%0D$9@AWKv<G|XBZm6
z!g&azq$LjD%djO(p4mokt-&S;pc|BN?>cgGF^O_M!}pZ{xTRHqN8Im;^hN2L(w7(u
z8EIE%2g<fA?>>$V51KOPiovB*o14C)cVQ=;7)ZAdfBs1N5~gcvMkh%)dR(SvA$p(S
zG6WpqqZ~t)H9uTCj9FTNJVunA*f`m!V=suwhjsLT;8sMfIT-SWWpB*e^qxP0@GnR&
zN>iRKgLGQrXES!vG7NA|`14ii4U|5&v9v9lyub$a=8&X||3La2IMWPUb4?tZ<NbOB
zSQmciV#=nbEJThON>95$H8rq7P8-Q-_$uJF1g_Uu5~c7STlx?>W`I49-z~W;vb6f}
z6*5!d&qz73?-Y%ip(BAS9Qo!?x${@0FG_dp_@-$7t63?pAa^40{jBu+(iez&qw{Tr
zz$y^MVl;w|GNnvZHH|+Vy?Apgj`xzN@S)cR80DDZtRLRg1J|GPY)gJS0B2_*PI16G
zwEjs8Uz5Hpy&>JT6^g3Rl%e$3G{sh7e@>*&p~8g$KDCVlukArj!-uA5>dVrfN?!sn
z%8=rY84fLqTn;8$_aGn<FJG~BYxDX`8<ZFdzmI^fsV+{fog&m|Z3bX{NM(*XduyY&
z8=KHxm{zujv2&8)YYg)jmLO^yD}z>|8oW0^r_vkJtCakr0ddN3fXBRAd&uECl|I7+
zzZu^e0OAq8qP2;s;}z`2W^NzzBIih6KxlUld5*1lw)Ah6?U)*lN(&>XH%B6~ZP|kM
z1W>)C^XY^sDkChc<U?POUZh69#h+JffCwf^n{aibKNIPWbYD6mJ~(2pt$Bs5MPqxK
zL?N9?pO)Ta2x55bODkhVh`u>&0}bJ?32SE5JsMtkZLEjNaG4wV%^^4EmYU_(M6ksf
zuMJzj(fpdK^8^Vcz#@Tqg`Eq(v?DQw71)Hqe_FbS)6Q`$QPs)$WjUlC7r$ZoTE#wz
z-5S#@p7^PViC*?5Sx1gP6EVP`R?>{hKLhYPmChK&u%;w)&lm_R@QGd}G0cNorSz9M
z)sL2tvd3cU3}IcNCly3@5kS@C2f2|`dh%b;Jek=>ZY|u)fCew<*W82%PcR{;q!|r1
zt3WTpK=v|_bnz?Fb0|p+<e9LSf+N)s08~Q<AQMB)*97jjBn3Hws#cJl_P7k8Zqag`
zfuydOOc}+|n|11pIV5u(=jc|&HmafcZVpkPV1k7)^Enphpzp`nrUf42=Hdn9g{h5R
zD^TML=`o-{(QgF00cP&tl#i_J-q?|V6#?mbFhs+3ILAblBoQ@OdCPM-rE9OlH4u4B
zfG9CNHN^2bLr~9Mfio6wLz)!^U<Q%pp>%4s_w8W~CQ0D|XW~rRZly(y1sa+$c|DJd
zb0k0GwPTlU7+-jVKkf}A&80Kx4e1rgKdp^To&gF3s$1QbM2awGg(WMve`OAQdx(E!
zHcziY+hdD{oN61SG~@twY!!k3L<M;Vs1lL!?$7{b07l1H!jdp1W6eYySyJcNN1!vj
zZ(#6^F$-2`<Q$XM3xLw*xnt(OA%>io*e)<Q;~8*3%^~^L1_#HJPc1&~+3*%SF*o^L
z8-=j7Nn^b>?v^CzSh|#c#cawb%hwzQR2hCZeE1o_!^%RW8Yx6!Gv+DuDwze(S&_$g
z1i+=`q(kHBlDm{Xl1?o?EDeGTI6F{ejW~d{aeGQTg!Y;zoS`JEK`5JBTqL8c1`IhP
zf{TEl_X0s9Ev(^U!n!B-wFQ!09cBO>+a;rX7f{od2RXuqtqC-yfg(isaf+dhRJS&z
z4=23ez?q?}Gh;hVP@trAkvZ0f-YY%@W^js%T%p5rY(R@XMACGlP&@2<knr6<cPvu|
zuGZ@Sk#r?}XeFAQo9uY5)FpCd2}$o$GK8@J$e39pPe%|}TV~iJVw_2Lq0V*qu!c&&
z1V}W93Ox>BX|$VYU_LeSOQ18=;RX`BG*IR!NG_!-_FaJ`onUbHEUXwD&~e$$P^=D)
zTq*`EJ%WqP6B{*S6=D=iNyw{%D0jcSTP7f^QTWWn5#Yy;(C5sN_@YnFZg}99L*W|B
z$xleMGMR;_jWD;8KE#YhPE;C-JCcEAggZ<7Nd@xgrSyqyxh!7(iv1s$)fNYKWaXWi
zbccD#nJ>YdF3N~?F2N5o{7!v{0K}tuH%)DH#ukOEtvhf&h(Q`wt^&iq#7h_aLYVa6
zIPDj|;cXiWRAXNj1OqLGbe{p0gVzw2Y;MO1i})fY1?~JqFt;zk8ZT^aSVg?o8N9pa
zaD`=vgg5-4cP11@M;M`jcDzpu&AtOS-4O&7L^c<GtVV~SYb@#)hBiI$9fRSvgA_cH
zg23u#JbVQHwIRVH+8;K&!3Qj9k6y833u2Xq!y)z<ZV*Yf6tQH*!mv#VFWC&VFGEGQ
zhETM4vN^cen&fqyzZC1#V5oECG9{*r#qI`4$i(Ix2mh-uUQGDiEx>*g8RV98=AL7t
zu29ttlcSD}h;eiqizAv%a<H%^_n(75TpdD)>gW88)fys#la=+0LG)7^t<lT;(b_a9
z!Y)RiKU8RK=C-p<EsLzL2dy9#h1&*gT~Y6!ST?b=hLFAW-BBW`aFKziTv3UdqPgSv
zXJ!*nH+Fbd$Z2ExGsUs)e<3v}Jpw}9Nq3M|M{j)vOc`+GJ^;!2jk`!8_+J?gq+`w7
zQT-QTOvRp)(U^mX6%b<2ja23tD_gLoi8{)lauhr>dpM)Y&{;Ro4hK*pjvfQ2^uNuF
z^D<{KJ9sUHb$88t0U`n#@G~3mtYb@F%0umKu^Tm%**ln<3u3}+sE;SN9Y~T>>V877
zbr;hy=dzYDAs-FS6p4g4;)p3<cV(wRgg@%lK{LRc3qp&Mtt;9%gP%OR%>i^gLd}W?
zbu9{COa5u+5s(ZRNy&G_o|VOJU7wv*`<<gob87i#%|W##p>TG{;q`9`>tEsk!s2`P
z<O1IH{!^g$8%sQb3p2NBU~ai^7GgUmUYp>eBYkMB@E+!Q<oRnRFl?<9I433xGVsdI
z&KoHu24HexY=DOCG##Cz5mC1YBT8%GiXvoYMZoItD}qW9+=Cn#yR-s8u)}&Z%#i<%
zRovKVBpU-YN36GJbl0fB8N8;>-ak2=5gKs>1Qqu`Vo@TdT^fxu@}T`AR7T<261yMq
z!fPvoT={{#vH!OuILFoyGP5HgGd9(DBV=Z*k%~yH0M1VAra3k^0*1Zvgx^T&eNS3S
z=cKYFs*&5)Qf&SbwCT*^gDq5q45Tgs`@${9{sl72^9q)J4PqNV*&?UC8akD{!CckY
z5D$F4oZZFJw@6Q{xa>stIlg9l&=Jzxz?aff=}Z!Fbkf0@0X`FetP;(;`M)sZwULA-
zM#RWCB=5-BK{pq-7M&d#3!~%tWszMs8bP$y<Iv(OA4qrDQbEkjZ6SI(e1#g$DR)#>
zJKI_F%5eAOJmCzw=LRbg<>yN(vm+^d1`rboYxuyag`yi)db|joS+C^|8tJKp!V@-7
z&j&^HvBGr6J*VuLc&_uq-S11#TAn>)GY4#x^x~-<j?~*c;lqz{9i27UEesdzp*S_;
zxRY)_q5YxbRfEFS5TT4e{Kf*9Sb{AkVPdt@1@E|5o~ZO5fr!k_D{RcS%Sw?q7I()g
z?Cf~OD?q6W_Dnp*cmZaM*bBq@bNpdz6CfIv=|=by>Av)`^pY{O8l#NVSjb?THZHlP
z^iaBwt&Kcvf}F0v_8lf4*#ZGlRaVtov0HPCq4!dV0Bpx^lw|Wcwt8ZuJ{_V^Dcrq_
zl0~gBvy@?GKv<7JzF&Z~6+1*&8Cp{I?M^iaf`yc8-lPP;>-tBF1%$9KAVh}N@k?8t
zer9`Z!=E9H9<(7zN75bqeI#W$Vc`UpZROT~XT1kus51td1~WUiT&|b@H{_4U_Qkc?
zT1NtRhYqYWAGEb*-l-k#d0|_!8kAxi;*uliC^EEcVilWV3K-8nw+W)M=Dn!TS7s+h
z9B?>*n!yLMPU7^h8$&xac#sIWgBB2ActL8LeE35Yq^AO<9RnF)Z2HSNhGm$$`6poT
z_~*<Rn_D|0s-zpB9H_7elyX*k4J{%tiw%j!?(pWp-wN+pb4`xvx-c+zDE|UEcd*zJ
zf9*y4(#~{on8AC;YS2j<zWo$`dOa9N6b>!uWixzDst257H1Ar4Waz^pjH(b-&W;E-
z8kFQ{i`dsN=9H1bPBow)8J0Y!8)u(~$2wgwcy(^6R0f12V(M}EckH6vs3;4T@EEUp
zWrxi6;_!%p7k!3Y5FfS%{tgRp4T0t<`!9^zj+Ni^t6KZ7*waA6gRfDhiTU0D`KK6`
z{U@V2b!wqtM18^Q2&gT0_ZJve4dOZn5_^$s<l54yz>;+M2zMdq3%4G|Jj!wtBG4X+
z5F<!BG$U-l9I1AEZ#QrN5o`oquZF!QLq8kKG)FdYMjg7N{8rko4%|O%+tL%!*7`i|
zO9d|+oB>hC-P+jmo}vUnxWL_&@YZT3HAVQ-j&5LCq}tkwMl=*b>^a-$#sE3)-PtoY
z{9a~J*X7_m#!9*a2d}d*w`0?0tnrpmYI5iUq6V@BsIM(7-oeVt4R0A<-&+1J^M^HD
zxJOn!+s80L7tLt6%Zw}#6sj$La|DLYY<xm%V!Os%pBe;qg=60DLT92ziKeshuR*7}
zATzIn1s5g0g69+#PmJ_$tsQH7n*pt{FljqR4Ag?|9l;R294tX#d!@P2*sYz3sW3s@
zJb##=puV=!1xfgn=D95=%gja_@x-<%u)76`^O}$>Gd98S%B&9$ww7;gEs0pu)jJ}>
z8o#+UU0&J?&x{sxkO8DGOetVyESQ{M+9Bki_w8+XVii0fh3a6DJY_MW{krS73M&ji
z&7w#5+SC@XhT%BBZ3E{FP7t<o%UoC?m1H!|jHGz;{3T&pkPK>L%(a-#+R)Q(aPy4V
zH{Et_rxPs=am^3&xA;`J(X_<(Fkq#f4AKDx_98IBb2_mu)u_qJC`@6JhIPn_P)OFj
z+|bSwuaggAFW<nyzX{sPsd>*(@*Qrcj{E!f1&6WibFLyY+79F&>sPR5C7UtHJnCyD
zAnP~_r8GyC+08Qw!X><%Ar(Jjg2sk;RLqc1;!KT{h4tlSgNti|lZL*eg6EGeyqQ;E
z2M?d(+6$~l1`s$iT%ltx_v+|@zXaNrai1M|(2VPQ0Zaz+IYoX~Y~(3{-Cgc^YXM1?
z!H4}QtN}bX@=Rl(*#6?J3ASVXC+vKo*QKrQ{`pUc8!x2>v2D;fC*a<D#xTnG*1&Q)
zhOp%L<_RL`1}Yq(bEF?LtmqXS+QP!?UIr6AV;K%~#lt^gU1!{*=Q?{efE*i?nGZ<L
zeQ6<1acIU0j$fFP2n8y*vcqLsLqvL?$p4(Z*73RvaG{`-jyuu7a0|oGCdOqD(Zo~d
zvShfZ$T@3V#|b|SO6l65mA&NB<9);s(Rj}dbJqgTZ4a$y=y0@Kmh9(_nqbLzE)k;c
znPh=hr7pB{EM`<7cSNuO0rk9B0e@hI4+yHomIo;a2(Nq&>_Gbqv%U>Ta%VlfIj&~z
z&B)>6w{QR>_6rDTV!-~){z<aLI4r}K#`&VZx>M;1zHwytRz^gM!dZbmJ;s5I@bGIJ
zUAVJoHhAzG!F2!t9K=aPK~&foL2wNs()r6ry*fAVTR;p9CT7sthOG!4LHrUbK*f#>
z@=#$0pc{0OhpnvfJ5l|Js98p@y*rrZ(AkTud5S6RY=zm-?6wRtIAgVyuQNRMv2FW&
zU_c}AU%{4Y@w+n{;u5^F`QSW_9Q-KyL{rr{D5(=Jo>K-Ju|KT=rooY^=&vXbDqKAv
z+iPf%c+Jf92F<EMP4=J0tZi_Pf(Rt4H&g!J)BhfmaEj@gV`v&4_goTe{*KgsZ^<G+
z-teQ9P3#Wtdg5!mPaSQKU+k>HEfS_=YZq9umiv!mhz$)}uh;;!&Uk+v8{<-XC_RbD
zAnl%X@ykRlTMM6Mflpzv?*8JX^oR={SjqfS`jGJp0U%e03e>S~QAZ2vqX=tftW=o9
z9)7olcC{sliH^K${<mP5eJ_HEEgl;nQq|;rw|M2H^wdX*9$L>SFVbR33)|iTNSXVA
zLh%aM^n4XI&0ztpxieO2uaQSq0c}7rucSL1q8yNX#+Q!NkZO{uE&enRsjcO~A^B|Z
z@LSd|;<((Zo;4+{ZomYMgg-iN8-80LA6|-W2NA66e@HG!lT!?J4H<F^%Rz&7pn|vX
z=A+Kc%?e!U)Pvv%3(nzN<U|qyA4K6Y7&$`+nh}GPW^*E9tSoAlW$F7RbYMB2xiXK@
zfj@58#d{j7*cfLZ+zXlqA7ggb5SgMdGyDXKzNKm`kkf_H_F5|Ay)Yt@^~QpM+L{nf
zQ4nu<zL8wXbdP&@%$hR)2EQ?5FI+)4=sN%zdbPHk@rZ&@&P6StK#<G#82W}_Kj)Wb
z^i|}bpO2*XeG++;M*sh{o#(F|*LB6$lv`gTsY*8(Hed(|3<L=BA^G!@z>#1Zkz+})
zWLhREiKGh4E4NR{hx2~>u9-XUQEnIdV99T0&YZLB+H3C#$5q=rr!|6kiMd`AocA3-
z#kUgIpcS&1tmwAaMt$>o{X4=3S>--dr^PECOO^CBGhRCtw8~nr;`SB!*Adi$>0lNd
zW?+Y{4_no6E?et_T=h_d291eGBgd0OT6NU*8=csrFdtlkgpRF-6V<@!O)Q}3hS+vT
zhIm?+VOVzd)_v_68_rW%8)v-ffc40TJyTe6VT09bj;6qj4siHn<|b&kv5<w4EO$h<
z9TrVt#OpZ*aEE&C*tztt_l+)oEpS0m+@J+xV!r~%@=AJsSU~VyD7y_$Cy2ui^ITXP
zt^5|X`Ikr>SD4Y9jd;;5#_W$In58M}2ogjX@XR{l2MlONrjz{sf(ST@uxl##Edlvn
zi$XRh^nSczI~s$EGLWK+-WTN~dh%j%Si}q*V-CPI&;=PHydu+nMZNk>7PV$yU(xN~
z0fK|7<b2>DJ{+--`Y?kU0e&BGyul*dnA!_s>BbMglSpqYc8Kyzj@_xcufA`GY7MBJ
zV12!g7{<UW{T&KXqR_mJD?aA=D<Zp<jkC~U_|sx2&p_{jLsnT>HYK}!)2dY0Iyjf!
zpp!LJq{3gfkYU3a?u%eKt@J;zd80ed=1r>bGF+{%z)|{0zX&{c(E3k|ppqX_!K1X5
z^9mkHo|$j2iB&uKXDwZl3!e}K9~l-@5rOEYCV+rpGjfl@Xrx=hgJ4zmgV=BsQwujr
zD}NR&r0OX^gxsZ#6A#@E7DtI2%?6o>nGuz1LxFOeMOE-F{^HkvF*lvgNVztM{gK(0
z>0viWj%VExt|TQnC2HF;#A7$;y~Os6NCNH}ewo&CZbh!r?A+dZwN^^5&45NoF(%9J
ztfFK-;vF(TVew-x8q)vDHQ#>~VdWYMQ}7upyw}{io3dU26lcDXo*SPs5H%-MDCDm-
z+1?uYEosgvFnr)g>z?w&{yCF#EZw7vGj%Yn8K^Qh`QNWUoy6L*v(nngCk7-g4jWNJ
zr8nd&QTG@zq9ZpfcIN4GrwANLa$`qVS?!{+2&!=dlHsyyW9vlD6T014Cukc6v?RKI
z%SuU6HCzzZp8&0ld-;E?zhoHG6$(7(fssfyRIE?KhDH(l4T8-xlCP+G?dZf)j|Gy^
z6cMz*)}r$%*8j??xx*3Y$T(n)1;m$dHqJ;$HkLA8Z{Y-|xuIU$SWYtsKUx}#sUtuU
z5k`F`4t&d7RMHFS87*`J5ZoGmV^lAmLxY>3QWG|-V*sNRcbAYSvxF!}DI;r<DB?;X
zDiiD62_AW5Fb&asAHRwtjq1%DA66Nhm0_nY`LM)36b?h%+O*-hnXai#WT{Ofu66Jj
z@vix!H3?-h16#Ck0DyM~FbBU#2fkzFg&_r{bk*Zw<UlaNQY<Kag<ND7PgSn`Bi5h8
zPMBb2HVAKMbTOC#w|J>b=`J+x7V8(IU@AhD=Prf8i&$dfCJ1uqQo%+f+kD5Otv!BS
z;^Lx+JL0vY4<U41hp*6B#C!}3tK#`b{L@&*-Z$e>C`y{UL6AlkTf91KBG}r*x-lbj
z3Nt}bRz8Ezd*A@Vj0WK(BkgGHJR6QN{KT~Yl~xC7O+*^|_)kU!usqzl9awAZ$<~1F
zy2l>9$$;iWN>f6HInl@n=bjlhd%db4kw@Y%>*k*;`rv1%(H_7q45eLK$tA;91(&F`
zaJA3k2o$KbgN-06D3wAh3%lxi=QA5*BMxudJ+Rh_WBc%9L<4q?d=?z|B@yf#7;yl@
z4H@1b?gf<NV1Vz4hN2q3hcR<2MF+&av2K8n;i&lSe;!|KEWxZeiwOKHn~pO8p@^c<
z+y-H7*t5#w)5T#sP}!Y3HzyU0I%*4$vf{n=I#*fN7|~@q#WARUl!8wNM?qA?#TkiX
z@HN7Lmc$8h{v9sN%-mRPRcWWQgIL;m`KS_RV7_tt7MC_M@HIbl0I`HQ1=@%wGy5P*
zl(ZJk<H&e|klYiTwyd5Je>v{Ge3+o9LkDt}6B3kw&jGXb^><|PNny<jE3ov45X&Bn
z-&$T9q5KlF5m{$o+1VhECcM1YLuS~&Wp{{bCw<$mVFlf7V*H;VrR)f9MwZug4Dc}t
zdwoDjibS-+7<S^YraQ%C&em<{#R^u`FpD7PQ>aty&e&*-BU7vjM4^+#1=VL|T_jUR
z;A~NbV;dBi`_`A#H6#3wGRRq%pFKNtfU<?jLY|WQ#;q5Uh$c>Zb+|-WIL37CAfTR;
zdyK%QqZ7BY{t*G5Lm-iIB-r9z>8H}$H__k<yu_H4>yCN9<j;@Lt-Tv*yZ_YOJ0z@N
z<t33o)eFC)^ihMa2B)DiYS<Xj8Ytp50Jt)gIvA^g(X|9xqn-$i2s*Yl1v|5ka&3{Z
zz5{BQS#(SUnwvxl?~`_SVP1W}hDO)GhI5VOoZ0snWVYq{2*jQVzZbhOHhaEAwD;ln
zR(d5plXlV@3~vt7^8`U2VN!=7xEk3Un-)a}NogH0Ei2!H2or0RO+2M7Y8$ot+}^Yw
zLj++l!m8TPs=}l=>wV0jCjt{&SjhPV5=b!JszWzegA-km%8d|Mp~!0+i@bFRI?~C?
zF!NFOdy-otwsy+@9UEA2#JP0OM#D4~fn{do_HREy^&@af5zrNXkYhkE>?HOIu(INP
zlEHIE`JGt?koPW*=SLl~fP;2s#I_u{+46DQjty`YNeX<Qkv?1kq2yges&xv~j;uju
zKwV37?8$Avl96apScv%;nT@HbtiR!UGs1h6l$<J*KI?wmVr#hG^~I5V3Y=wzxEA0N
zmzaTjhocl_*+n#QGzj3xLXpb8RMk=75WdO^(ko-~j*;PkB5uRe0{N=#qvAzZtJ}lS
z#ciOvR1q10R+fmL`9RFjyNbWAiTc8rjnIp&h2{J8*BF4@3fB{EJ;FDP=_$Ffjx9DK
z>TXLMz|>Z;JOsi*x7R=&BYRZa<+gSXzv0Ey9d*4!FLFdPCaVlXKs(neUjB+*lwsS)
zpg38`Pifq&!XTv7<U4|g(b!(>Gx*%bpBDVbpdKeUZHrQ5hu$kQyrJbG$Na}&S2o1i
z8+U_IzRzHnybGe{73}mP4ixMQuq(cPQ7}I~0MiAq#2TYA_m}S!f!8>WDL!!mon&m#
zdo%c15Vw149AW~ajxj@Tz>PePSvnzHQH6KXTbQ@Ps4)W(L<`YvCEW$zPYb>z@QiN!
zh{oIA%SX4?2#a-WNMMqOSJJnKUlgyIeK<1m${6OrlD}LYj^XMlKw|~1Sa7IuK_}$K
zk=P8oQFd=Pwa-yS^`Rmx+1K$Z7`p{P^8{a!&f+DW_?XOMz<}0fD`d6TV3m%I@1-7@
zF^Xtwvc?4gsSsB0fx;;lY-liYYR7ee?iRbXrlvr&D(e+^4Fuu&Zh^U(kQPP-b)N(_
zgezNuz-Yscrn~~-oxzJ6x#WgOR7h@Z5KfRDYtp)Y2GH6x6PeQtCT30UvBN5KfV|c;
z_lmCaDBv~xUu#nF4v#*qe9kD&a~>W|KrIPjbr?GKmQY#YIXq@b=x~BDJ$4kf1!kzN
zOxq%MDX@#Sgu;a4T6snQcCt^e&TOzj!TL|Y7Y4>Lm5^_Ou{t3JUek{|reGN9%KqXs
z!RAdkgN=<M9a|qoGLt79P%Vq@a~cMhFh<1|hW*<19Pl0jT9JR`Bw{<@AhDm(Sl6LK
z$HgzCw-AXLIo|{fD$0t1MC}Er!G<_L@Tkif$uaWlK@3-z&ST@A%XmdOF69Ve+y}wp
zwR6t;Eu!(7tf4ZxaN$cx|MCip+gQRrvR<$fKd~58;I;Ie@1)EAV#lI_(_PxP0j@6&
z2eLxxXZWp>H{T*jef%X+46iuhmZt?Sctt^dh4lCLKZ2M7N0VExZ=}-|Hl!1Q2I8Fa
zsx`+krTn%-G{(K~O-A5cU|+_lTlk~Q%JtEg)7k$mrY}e7H#Q<eRC7;(8TS3S5oV^7
zn*!2Jd7YeHNOp4VgM?Ze7(T_n9pMaGqpjp%Ny9w)5p8#?15SL47BB4c0!42NcQ!md
zJb8hISV0rZd#x-jK7GnJwlAnBL_pubq`eWmYMGkwQd(NuTh9Lu62;t5hAPxP@eR&N
z;-g{`tdGh@d30(REiR(6jX5H8+aUZU!nY%&90nn3a+J(6-9RCO+I4!^%NO*-<ZjOS
z#Bp>+9P^Cx9)6e8$TsEnDaxLhUQXGI=fk5Q8iqFQU@ipjDcICo9$NQkeCcstNKd67
z^Vb17pGiyUTUwJ=z1?f1E9t59o%ALs`4~!P3HrCOVchji*1sjg$%t`-O5K9|<kp_L
zXVFB0vBcILBjsb%tHguf^+|HFC8a-FGJR}@ghgSbfkHCi02b0y=_~2SOdeVB&U2nT
zGOAhV_VvZjq;I6RIq@TlsWJk}5&do(U;I*fDm{^Y1dXq;$+v~{7Q1kM6N^R&_lQv5
zUF68RN~6@&9NKVU@uG3zlsLAcf;l<_;0w-UV>xPA^`!w$rBxaG!gDTtE<NOf^TKY8
zY!*(2CmOpCYWV`n*K{Gu2s5>{t&I*Vc<JI7d}tsl(9v2rXF0OU<Uj<S6WX@jy=_Uz
z3p$b(APsvUskJ_wQ*N=r3|9zJRHZumSKw~XNCHC13k+~Z7vT%6ZQlVbq_3oh(mN!J
zfz}9ee-!pQEVxFxkUp0_1v8zP%#ElgW_Y;ooI;c?ek^?}{ftq<h3(vTR2XYB1Oo-I
z2xEdZ;H=`@${VuicfbEpu8O+b8W5$R5?EkYIuK&i@ODJpXO{lX%!FUzgAZx|>0O^l
zpGo(ncR)F377Au2ca<ZNNl`wR{!e;W`k^UfOHy(L!WOw?=K%g7eITv);{9D4K`q`p
zz=!tG0kpQWA<W@&a}b3ZoOsK&U1M-Fx{t4D_MYMNMi~8!y*RR^?ekB^08`y#hgUd3
zq}Sd-&@(`33qO!@=@-&F$Zbqv*@C=Hz(%*Rg;&x8=|kzB^o}7yjTNU#qTE-#ui>nu
zKS{rq?nysDV<JqLbFUTM)7!%cG?Im##Xb#wdxr4jtbEl&dkdp{7XUUhM6EF-zvKF3
zvF{B3P+0|aDXq-n_wLo%|JcI&(of;4Iw$<dz@{BTz8XKg52bgdpIP(Pn!9J#UViQr
zmUQtW=_5YPe!^0BOd}`(WozDOn1$7lqs%z)+FB`uZ(4PY^tBlXvz}yoW#x*fC)J>v
zdmETx`sd)@!3SP-IWy6YInW0XTlkUmF06-;yw>IoRnn_o4KIaN52UxGQ|Xt|U67Lz
z@x<6ZE}asM{r+D{f0TYJ-Ism=;iuwh!3WLld!7UPpLTK#&7mL+6?VVEt3~e8DJInT
zr?F8|le>0G_CO7#;b689utlaO<1jC&2P}FgfBoA^Po?*zvGniKDT~_>GQ`4C#Qr5F
z{rcjsr1zy0=|7~m5!4K&Jp&R<KJ{B4hq93#OTUuV(m$i$IY%F_ms5>=c{tBKu)4;k
zndStPA#?>gFySjV*A5^K;KWegz#>KngJ5_bMFi*-*ePfIcDF1QPAj*L^S%zGE&Ly8
zA-$CTl`lw4y4ZeN*9+-8A7`|czK}kUj->abALDnoAU{z+DZ#6jet1u%UrTRD|0ca7
zm81z%wq}gzU%B@9F9f)?RG_kPI*m0qjCv?wP#7vp+cWzBLS)gSo&(J|lB5$OEB^&(
zUtmdwy|3@WZ)n~837PYi%{B|1{+aZMv1`{CKbHO*wYhJAS9F!vc*rNxW50MKJ(B(t
z|Gbm#(LNEx<W%@icJ3kRLf-&Y>|AN}{(|y*37j_S(Ga7E8c@xcG%d%CpP)r~&jtla
zM%Fr9^4J_>6um!p2b&nuE_^Ay&k4Ur3#D*Ib68N{NDrkiq$@WePo<Bf`_j*C*&D=Z
z&E_Yd;~U??^yFVjM^YvIgorVC5K}WgPkNIKT>dLwcmvE39lue*Eo{<R$9RoE=^HHP
z42hP(FL~FZ?ISC7Wo`{{Py=euh+eaS3_t{~4=@xhq2z}4;S74xrSv=Lf$w38pg!e^
z?~w~8qj|)qgp()vgY=nuk96_pn9G;aUGk9VsZYv$!ILis6`<jjqRJra@i;c$5b@pK
zhOeyHotedss!@P^og!#v?Xo%RJhHP2_EmxF=`<1e`j^sU+{I414~r&9n`3DseI@-~
zdSsme*IW3BR7p>ye~}g(!IH-&llq<XC+RD904WLhv-DrmTKbuEmw5BY+F;i_`HzG9
zU)UTF8U7T=&zchRnGKrgXir7IhM+*jxn7bJq(+au18NOMFO32g=}++2f-AloL=js=
zcqA>Q_i(^VzQiy^v@WFI_I4qy@M9prf0E9StBMEiAQ^ulJ#Yt*9P-ETDZiC|L>RK8
zdX~dK`=F;h@vAIHuFT_(u+Y(0c1L;%Qj*0EZ%N-OwCD_gaBD`Uv?l5O@ziEaDr>x$
zn8yx)HXroCNrXQmgSe7@$O>1~ac9!G^hfD;(#O)di||r<DE(TxBfT;@&Wxi?HKcS*
z58Znt{OJK{?k_n!;fCjww4O@;D}Ck{$1N&6&H}|er7dJh?7k!Ytb0jBWX%(sAQFY5
zuslM3D=))E02|pwpc}!2Eir+oB+3U4AjJ5AR7=l@Lss1pjU#CxeJMS3Bw5(!Po$;v
ziF9nURHrt=W+6S19=i34iyufU=?(?|I|NRFt;NNwLmA3Ro2H0c*qzMglm+^~@LGP9
z0cK=6TZ~4=8D@6Jt%px(4n#r^H@KCWe-?DMmr#^X2OWTjz@A8wzTvFKMq^CS!Z*@a
z(xYA+UP&*dPo$U9W03610Go7|3+a*c<>BGIl0K4NO81%HTNzq1W*Whl(&NDltmK@7
zH@x91&kX_1*!Z;jmgvwP8M2=KG?@7rW$g>;iS%_WA<DN`hLeQr-jRjR`7f=_BI^Ir
zF8p44DSas2m1YL;Mbw+*|50xXE9r&wq4Yxf4FfrH!@}0+?YGjOdRv%)jZdXZ=^;so
zCzmA==tJr2?lg+M{0B{9V<eHCr7yt*yu-}G4vkgBYO=nhIF#H<N*x$H48xS@KK%Ky
zAu285;Q))NX9d>ML+K7-v26Ck68Pa+PySQk^!L&gsc<BRvbKgR4sU&Ee?FC#(x=8F
z$?*Hh$-R^wN)Hb+;D{ktIJ!v4mei382CbGib+iV_r{KvG``0jAi{1x?ii62p(d`v<
z@ij6M<%t6tXzWDt0na$D%9?_L&AuAsBJB85$D?j9NYsw&TLuDFAfsdKPVlKB%G=$-
zf(RN`PQur6i!sX?V*{$580M55Gdxob?wFqW!YDMG!wg_*-G9Mo32|TJTcXQh7=;k!
zl1I|lmP^M4nXx5@_djt_zBai{S2p%Ht3epK0$cmiK4R0_$w#$gxDz;X(O}snsBDD~
zo^~1VmB-;BXEh=^ChA&PH9lDOog}Q~Z12JsXo)>1)?Qv)C+m$9b#~G#X(fGOVpH>i
znR%<_;1<4+7Sf+h%S$tf!VC{zd`<_-$O^hG*nVb*H$cM;Y$hn*6Lcmaa$88YEef`0
zF{S&>`0(}`*kuc_+_3)WCg|WtfxL+%?1X~k*yOh*8?JBK|73qYm!4WDY;F&%5dQ68
z67+&-eq;>+@s73G=^OX|3)`EZQ#HmZ7?yvb`Sb24)1wby15bQzi0nQ9%>jSL%qbg#
z<z<w*q9d@=4bvc%SJuNG2|;w%rU$<4O_VAJU*pQA#~=RM9+9DU@xr1unWqa&)ZBFX
z75f{z;%>4vj>GVhYBMgW4tobn=seo$u$tVUWm}4^V?x&G=yLDZ^gMWNEp};hPpBJ=
zyWI2L;nzeCccB4u>up|k`!4h@j$dX9sGIgb+~Uff9FNUxn}(m<5U++?ouGho_CNhR
zn!djM*v3PLB@@WS+N9$pp=2L}E+9f&QjK(zxJ11!{Bu^{a|>e^T-b)CutM+Rud}Wq
zwYP|H$bV1l{S<F}a&+9!f&PO5Fvk<hMk-z%X24?u-4~!mEg{tfUL|oZuOY;UeK*#A
zQ9?!><9$l7&;q?0h=9e~g1fG+AjbaPCjT{CIB0)v+rk}%8rhY;&Ku=FK$^&7<G_OC
zTiTq)o@u-O_cd*&6C1*@?edCt=GCs3y<tfE+(=TJ11pk4_`Vn}0{buC@}^U|QG3+g
zqU<laQnxsV*Iw+5wVc?!<_ts}v9?-e77@^vbDt8>j||dmv0>5awhupobF|~K1;vq;
zJ4Qh}=Z>{*{WXU!=6mkn-?W9g_2h25zQy&2tjnlM$2Jh5vJZ@RIw5nTIA+#A){sH9
z)@$EcV2jcxobEYBFY6A7UXYUv^a0$y|2+qw*UJBG_x?W0bNHz2_AA5Ctc;7ZvwKyT
z-Y^~eN=QV?4YZ*#-D&{`T~PR3^HKGx_v9s!O>S;H<Bplt@sDqJ0J>HF^;!{bEoJ*_
z_W$<Vx3~X)TPyrVRq#2L{WD<R%U&x^%H1z)9^{I^Xh(C<n0m^(m;Tl$Ry6#@7R(HC
z$SDPPC`6@h{k`7Y{Qa;0Ypw}0V2M2$a~>nJCo3A!njQ@?68qqN6?l@s(<&nZ?F;!W
zL45`bXag#pbS=m$!A_~ahn)RwoBy{S+h22wsG2)kcWyywX4H(0PeW{pNS{l0upS$8
zn)hI#Bs#;D_W?vkG^6l$LN>A>o6A_yk$uknFW!Q`U`PJ`YV-O#W;X95W4DCSIA?}o
zVG_Bw1QE0qHpHSdB3-I;t@!N6o(&%vSVIGGaK5&v&=D(YsN_|zeVu>EF8(7g%WWJ~
z@F&w*+(AI2o>BG6Ac2|QMT&(dozV6<CB>Z?Eo5K4Jci`81p<w64N)%0Xo}qjK>rU;
z_m!)pGOjNG001R)MObuXVRU6WV{&C-bY%cCFflYOFflDMIaDz>Ix{mmG%+nOH99ab
zsc!zm0000bbVXQnWMOn=I&E)cX=Zr<GB7bTEif@HGB{K+GdeRiIyE*eFf}?bFeRsL
Q_W%F@07*qoM6N<$f<Y6n#{d8T

literal 0
HcmV?d00001

diff --git a/shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png b/shaders/CRT-Royale.shader/textures/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png
new file mode 100644
index 0000000000000000000000000000000000000000..df518db5718c3f78e549203bbdede72d29fb7ca5
GIT binary patch
literal 6916
zcmZ`;XE+<)+t&8i`l+Hut2JAMnzf3U9cdA>w%V)2j@T*rqeW_01QC=PRWqnjtBS-<
zjh0%m)!qb&y#DXE_rrUg>pAyzo^#!wp65B|xz9lw>T5Dx=ekZuN5`bC1v0w8$N#l|
zFJH_6#%1IM{NwsW?+M+d=#Mgg(RLT>YmPdaAUfKA$H$i9)C&)Tua+h30=53vUi5gV
z)6sFsYJ;AbxXtKIgw%SO2k?<eui)FTK`#>A{c)hDAZWwyDK=m3n$z9;gz<)Wx=F3i
zK97rDn4XuUyd8HDdWPjfmWl~7;GB9bK0o>Q5>!m+T4eI$4>*YKr{$7MkNnyG4cKm(
z%&j@Ll|z)B@wG0*LkUVNO|WwK5oSMY{|p@<wN<1MQ8T&4Q)ZB*6MVk(cB>k&m94++
z=oca$Jb9_&Z&C3=|9cH3FR4YsLt%eTNhrxbl%=M=E;5eU_oX6LhT_=8k5Ne9ZPxjB
zx=?3IBYUQ6W8Td7<dRgjE`kz6{tAIJ5KDJS-#TqZRvpmd=kMQ0DITmIu3M*4Sm|f5
z`1O$K*aD6tBTQ;E3P)zDSRLUu3-JMO&&@XCxq%|d1wT{^?>#i)x<g#L(xO=5XS2s|
ziGXocE@7`}b0`$5p-{;){Q+?Y_*>@^Cr-@R^yqbiPCNVLqX&B6_U7q#<aTBPrs#lr
z<o!M6t()Hg`U-f2qotXCcQLYD*ym88zCGpkkZyiG$B>LZ#_g<Dl_fYZ{ga;D5fvX@
zE6OJx`}#}07?&>Qr_SU}qSUw>#wo0=krNs080m%?wqn2X3?gudse8g@t^$(RorHex
zO;weMPNc^}Gt&~3PHm3R4xU_?*u*98DbTTe()(#N953q>Dzr>uoxBUd>Snx<FVl_5
ze#FeXH0fBpv}1wJ;6b+d)Tlid8$l~L{xDM6vOl!sV-M?~I;~nL-_A;st-C228IhwN
zah}@jy_Vutq4itoG&dk<o&OR<w>IBl<9=G(zbv|{)g5U)vfO86C02xU!Nbc7ga)6r
zIj@-<3EZxo|M42X=)Z;Y^rugfdl3vV#e6e!MrPxbjIFBkl&MRZqJ92{2}cq$BWJ%7
z5xl!GWzBeB-b(&^-#%(dmPBRbbiwJ{6y~N`5#19W2iq%$?oxnAbVc}+Rf!6l;0{z!
zOi;Cd-5inp^`Do1&xan}Jn@_%!NUmKaftS{Ql_!}tdK03C7i2bVZ`%XhqJ5A6$Si0
z<fSRf<qm@)^3|NCm{HW#Ceh8RambAgs?GUJZ3KNACaY$Zs1=^V&wKyjH9RWN(v9s@
zj{HuGsMlQ4(*^>nby-r}h@by(UxZ)&Xk0ByPDfdCzi6r%eSRWp+tstlrB{%u!i=u&
z9hNv&3P?0Ligfi`)5>xZbKJfsz-zs%iI;V#!<?d1kHyq+sUv(Ywz4i~bOO}wcw-$=
zT{GjPD7XabQ|2Zr)DsQ5u`WN{BN>CQ?)O(al*?&~GP$jUHa3U{aJqbIc}MbEFjlcP
zofoQ8zcs*rhbAVNFE8=Dc2OJj`iOLrt>8Vm_$BsMop;=FHlFxfELpCPRZ!3T_721O
z#{{D19uZttbw<hu{s|_?vJYJ=K_xbuT%Hzb_!g&99}?7gzm5i%-<F*j`Yo0U5%^i_
zl9E}Mw7ayyFx7FXxBhGH+#KCuS(Y(Z)v-<MtKA@+zN!?Wot%vU15zywuV;kE5uHgT
z<Yy>Ct=%l{bhqH9cGS1od70K^)rgKVHd308Vt5UH+r!a0{!!3=*PGC}db36(i|^m-
z9j=XT8Ye%6jr|JV9<g~_Dk0u+r?U9%uu;pg=EzFXjp}<b@&5O7Z5r&~u?TOkeD8uZ
zyo=*Z%aC1Cl5s)xHM-0-2dHOQ6WZer>S5u4YKFQ$?@*V4Pn7GP)ec0K0}IN*&T$d?
zn5tM|X^DGYTh(qDy}HmUwR;$oIQI_KuUGfqleT*FY6gDa-0_cbb5!MPOR_f1mul<K
zAMtz{GK~9tb~jV2CQQJoKh5p6oPgtC!Cm&DE!=&yu&T@rg2iUXcfT%0IW#EW1OGPA
zSxQ(gTsm>GY&+UDQ^It<R5Q4D&-pEcSdpP0;?5IjX;;<Z`Irkvkaxt(CrESu;h07x
zK&`93^~n@%2G5h3B<?)@_~z|dG2hg(>gqjBjXTp}r}TaRObSQjhkv43_Gde(R@U8I
zni~0G-**mQ{XFY+WTFb(E%bA0fUlkF4W|AEp5C8!pi&v~SlHe{%Lq%)=4O!eiXn`p
zJSY2q`Gz%;*Dv?ESrkqdCv)x_T+>Kz#ENIM3Ng`8<|y_~^MBpFJ?#2vGUP^dy*WiE
zjWm~b*awkgSH@Vnfx<;;o;&aRed5Gd!|`MTCD>&QrUYq?lxRUA)K5=3yR6$c3ZsK|
z-ZyI&)T&4q$XEY`T@SP!ba4tDVr$DDdG24y!MXP%<vDmiB?R1b!%5$wIbKI66xx>t
zw0p5%Z6Y=68fe)Kf08?_K(*Cx>bD#0atd`IT<kvG?=_KH;`bivYids(7Tyoh7lc1t
z%9o*10q0Qt^x<ZSu6LBEz^Ct<C-wvjYis*LQk48}h$bGXX3E_r2;71?BefTw@rtB<
zfV>WgIJ5za5t&spjr)8HgdAbnNPlVEEAX=#G^Y-a(u;y>%V6P^6o-1}UO5l{;XJf4
zSX<Y8Kd>XuD->G0<3Gr_OaOCr;f}m3Jmy<%&Odjub#Mx{*#T9b#p_7wup;Y3<N7yr
ztVcSjO%!e96%<i3ywcA{Cs8)r@NQ*R+SQ4NyXV4NlM4nqWpd#%d$i&yMZxEprW9<Q
zI6UJ`CU;h_Zx+pP{^mlufQ|Up>R*(U9MUbLzgNGbjLJnTI-JF_RQ1pEO~X-V9>8@*
z!ORC~cbQ`}+^6k!*Za4BwVH=~3|Hj{jm%T<ifalnSdRcI3e#Io-I~sQz=CJUXAoY@
zYu){|Lo=fcl;MJsw$x5L9vx%evqYcbRdf6725fM>15=$5^?x;pU1vR+ZfoM&AlN0u
zqo7?bng;*;&yeW0<qwdnB7U-YUVz282G|zmmiis_?|F>Iv=FIsEXEY0>9w){jxsQ{
zR7w%?wx>pp(g2)`VANN67vWEDExj$%qjOej2d-O*S}N;X4A?q+FmkMykH|*5l;py|
zSt2}H!hP{Dkd#QQkL|9@k8h*L*kO5gg>TNt>3FxmYZItfry-0@Nd1W%n9+y?F3nwY
z?jKullV`E8G%-LO*{|9UX_DC<)1DtqY;nN)+JBfelr~t%cX%T0L3lfT+R3bsgl<>_
z<g3dC@&cYqi#tV%cAmz+<|!@%eHRVTXml|+=#OV8F+2Ws7jh@vq7>HmO67^xmHV&%
z=HK(4g{P8xZ?9D2YsQ|RNp_@L72S7;z4HKgd4!w1C3OBeZvTgvX5}xneA=2J#OZN&
zeEH*0einl_E~&5%Wlf)9wGtW_&|AcJ@MHYNJ)6EysbqIR^&^FfU*UZ_zrqCKmX!cY
z)LXekBB#cC^F6;|gV?bh%KJ5U@3*1hyPulBDDggADl8aDvs)IKWN$Qyk+pdYzg8&<
zRkj@@elSqX=}9sT9%X<AHd~IUCXI0r32`B;t5@g0Hh;7+G~#+0knE5n=R$UwQCk!-
zpFTbl1*q>#)Kc|xHl}%pRkcI~Zv$hh$)+=p7K_uTsasfd!>al4vP#G|G6uD5>V5Yv
zr>OC%-O`u)8n~?-jqZ{;{MdK{hTqqxKwe^8OH^)asrKf+-SXOfwXgGArN$V)5s#7I
z+dI>NnLHtlRa`QKCG3BCuXW<T8|(4G_nlIns!o1bJnVGuWP9Bx^gAedK2t5HyV1#2
zgzaV1psEM3oLRYX^$?xwAeA$-Ko3mXU`mN~RW@2oOH9=f+5;ytlxCqgh!VkHdc1sa
zFK7Q8aZlweK;Jk@h>V2GO|>uR7`{eAKz;rJQ~LdU^Dtm5X+M|X_n)J6nYJhbdw!WD
zR9ZkLGrlQ9?T4$es?3fZ0g7DC)5>MnuyFXo<8GPw*b3=$Djif1oWHYcgm&XY`Ztkh
z;(1&eov1zw=97~rwkzDQnJ#6kpuaN}v{1FU{qai8=vS4?LDBVQlqO53vxfoZYy4j{
z1Dd2qDW#3_Z=$#LRpnQX3$A#QN}8-`zPz9D>^AOyqDE&}XOxke%hm&8llAL-BN|48
zM59E-5U1Sa+b4Qc3PJ5`WA8N<W`U;$yXa+*xdrzxZ;e01xzw{xp&3H&g?k5Wxv4z4
zz$kx68}yk?Y>Z29O!QFN@i<L_K6xU$xNtIU5>iA_t{)t^Dn8X6z(ck>$X%YlEsF1K
z?v=beR7Zaz17T|}fSfvvD+5FL#9>EE$su61{p{%Wc{*3EX)LV-a-)NSTHLNS*@eS5
z3P*n)GevEtl35|jI~t05DhEx)0EFEyaX}(I7<>i3k8U4?V5J5^ZO7%K!;eQ7{k5|f
z&${dt3+;W}?reO$S^{zuDU7J#@B1uKl&o**7<N&EgFNj$f=+y}4m1jK1{LaMD=;oD
zi(j}dUb^}~A0wCm&vg0<xQpj-ti&U>;?uxybQ0$ZBEcxD9xW?;&K;3h|2;O5goW#+
zI;#(uzcR2k{?v5NPZ<6g{&rf*n>1VBA*QL$rDwO4TGXAX?X_bBz_8#ZyIku|NI$qV
zeG&u5d?y!`qBcE-QUux5f9P`}8hdUzq&e{;1HBu{^<x_Yvl+qEtz!Pd?fDw3<`y=R
z<|kgJ`npG%m&@Ifip#V+i~#j@=U`D6BjpIY(Y&(r_SV1*_aCE;KZ6BsfgO!l&n$o8
zOw!lfBLPFQ(+1f&)88^L&c@uEUwi(L%!=5c;6iDGkQk`0=(uvy@!@KM2Zew-kBw?e
zi}naHt{~fs6n-p`SF-MBY9U#yd|Naqt)N{RESWav2rMNUE@kdoUuYhmihU7Lsugoi
zI{tXS=n}B!0AjbGWf&HvD6#L_1Ghxf%yA7uxIy0R9(jgEdIdR5CN{K8VS`W=xanVm
zSk>FipfZ_dVT}4f;#E~#hq2zs0rhv__o3S<=K2o;o!p=L)G_S#W1aN2;8rS^NIoz9
zII%ntIFF~5rF*?KYFT<UL&sJGp?SA64i;ofG#n4b@1;UFvQW39R}aXb-vJAk2S%pS
z9h>HMlyW)`Ap0>#VRSA5Pp!hDdm5s8<lbm*I5Q`?;P=>NXv$4}hDDMiv57s-Jkntw
zG&}V>Y@9M>;9{qmf3n-h0~cfM6}W?{royrGMSRms<z;>wZ7+f>Xl`MlGJHOJVh;=V
zaK5C@jwWkSUi*cI`Av2-H#K6|t@~f>V@DAZiDz`?)q^PpYpuBLD^*SQA5|*cwE0)o
zf6+I@n<~(r9tTi6&WxO=iO4Cfuy7{DgBj^LmM&Vynq^5_^jH!|lJ2XF=ekyk-M}GZ
zo6Hi8jTV_^&RsD#woXaj-!1bvYU-_NtOy!>OS7VQm<=j4=fbXcdj9ZpGY|?2QrNZ&
zrp6-x=eJ5-S4o7*F<&P@>s*5wx1M^GdD;QCa~-qH`4=Guj2q#4|GKzJ3z8mQkF(z{
zCZA}Dh}foQ$*m%kyDYJFgq+u|?a4+4ZEp6vmu*dlOET>s7`asggWWi6UDroTiSho`
zfA=?q>6;$T_G=L(o9iI99lkI2+;ikyb1(^FEB%%mkM0sCzN~KxhbIqXp*7CETTi%F
zf$YqQg)V+ZuXL4M_>%^(*E~_prJm(7sX1~>D4g@RU-E=J$y1Y~>l#|3>~S*2fxjVz
zQO5d`g)U2vCv>saVaO0ixkA0(H~pwC*%2+Gmy$Hv)XZ!|SRY~qg0Ok8dg$hF*q=fx
z+!y4mtZ(671)P?~*PD~<qZRTCEhXasMY;1|>fQv#1%b+NROE2;NH75<<yhjtRan?J
z^r<Bw=qq&PcAJ%BLOB-=O(kbqf_^Sqd~AG2TAv@ZsYp~pmRsdFPLLbxmS2ssO34l0
zJFz7{JyL4*vo1EsB|U_h7FY6EYFh3pA+krDg8y@+(@8Y!-S%cFe?X=h*k$lsg%WTi
zIn~Ft+i>ulkF*`eco?91?(`;w01c|vR}~J6*j|h*YZuN+n~rVKIXMu>uE*1V$WmGr
z=qPV?an{A_mUNm373?>9&E*HEqpbV#;)Zv$2P@CKqF!?SEO@(Y<ZamyoSl`*4YUZ0
zViaFUWcqaQY!hJ9ULF>@udE2WLQg_RTQvwNKG3_Hshv5UC)urH?-4UBuj2VTn}-*Y
z54kp>l^H_1mOpX534WjlLnWO4;w=G)UaxU3WaE#{`B9^C<575+2{$7lESkY1ej@U-
z3uRmZu2<9b8}}_&)m5|BxQ@WOzVGWv^3q#y2D7v3?q!(RAavw;33yfy6^AJAs}LS8
z^pESQ6RrJ6VNnbQHx{myJ{x~M@xdrlKNIBNFZ}GOqOXa(abn^EpE%>u(e!WmL17Tc
zapH!k+ho>-I9a!%yiOBW{{nwZv;SIt>PIm&<-eDp7y~cn8(+5m4Ln-;7IVn*&r^=~
z-6(gr;w+%*;dkK`a~s-=&6a5TNRD5xik-@$18I4_KGVc3i>1FYpH!MP?ngrlSS4M=
zKGb$UT;P31v0lEutMK}v!Z~y0m*{<=Ttm+FP80%NO5ZlVqafvaDzKojWcp}KIMdLk
zw|C&-auB@Io$NK!y8-A`2{!erGKS@smoYA<@4%zvJSg?DA-G*$?8&oyEJac{QK}84
zgQxru_31lN9quz~OwOHn-W#{8B(r%&x6cXN$_NkraSPb??%;!o)O<0ggwe#ggZbpj
z+w9z>$Pec;@!~Lb&W8p|m&3n=x;9;z6y+_f+&$ymMaxhZQ3&k*q?=t%FQB<<%k^>Y
zo<fJWDRojWnnUwCad}E^0}b-z1ub4-B1!^M)BWtj7)v(dRZY+4L$p7QD}5K*>$#N_
z2p~c~vXNd8*@1^QtN_LeW;ah}-;RLquA0Hx_z_wzu-P+U!hsdwU2te*cd`9fRAQjo
zv4xkp+ov+qd5hP>XudwXS&VRN9f3bRV+bL&_o#PHG+%cO^jl^XOp-oP9aDrJGvqp)
zzw1VPnHtHySCsYuscjxuwFjsmLK{wR7~_f0FQiT|QlYnO#;k;_E^`7-(9qb@iAS)X
zTw98uB0cTKeD17qFL_X)hnFgx63+Ri5r$ipvgZ~_l<tOY?kV1LuP!^Ha);WtV?Kh_
zz0pq=oXv3vQe)-m4UhiqA!>;q0?4Xyj_x6-QJU~`DJ(Vkd8z3(<E_54YXLAysLsHv
z)i<s3L4U*$gPOntt5cPWbai$+x*FD4p1%Tk1KFB4Rqv-SW!V8pmF^yIe|G*Rl!sun
z!!~)oqjkoVI-G|{&9t2?1-lw9cnmI8a!Oe(2u^*EU_7*&+Y{qf3`^7EzKt<FJTsfH
z@pT_5s{$_3bu`bz!C=Nfg2GQ%VWYNLpjS=ji+q<-g_dV|Gpv)PyM#pji7Z%^7ng#O
ztV`w0A{X2UoN=sYkL&zUJHD`3msw@Wn_eEq34^@gBbfd!C59L|=sJ0i-?Y;G!cwJi
zkSi;A(ax+Nvhv?a!Phz9d@@*icJoxsisKg=I$tsQK1jx^I>bJZFwNgs+lxVKcPP2E
zFyE%Xo`N4`Wqqg3C0De`^aeifVP$=j*>{m^Q!u)6(Cvpsu9MD<v<&b03q9la>u^jd
z(8-@Pf+*}2Ic!aPy81lXd)A6Q$ZN;Sl(|v=Os^vy5nEYjZV9r7@{+Kf=8C~`9z2b<
zXI>p+Mu1j7Y&y6iU;kX&)+&ISyYzffuJaEz{HgKis7}b4Vq*@`vWsy?Hav0p1aA)e
z#;y?%>G8#JTs^7gA`<V{<gik<1N+Cgv_=t;^I*SgZ5+^RwxYHXE6AFVCt)|e4!ea#
zKtwm<GL2_=!DH(r9T@P2{0p)mmcuh8S5W)lK=c;4r-vc?JkvCY`b1$NdaN*^G{k*t
zOQx_Tr3H084hwr3h(MC~_vFO4LW1;p6)E6}P);KAR@#V?tO4+)pZ@a$T5Rblg__w!
z%HCreYf<pB>tctuiWTNn+0cJ#!7e+_sP|>ekMj{1vE!x3Lb<Q<6PmHW5;rY9=KjDt
zUB{d}J$l32-tTbpS>1ieXRJ&EU_RMW##q5C#6{~29~6kk=4sCegNSBrr@W@{woFm5
z`)sCg@yv{wc2sca-4xf%5wLT=pRT1h%P?qFSmJp<1dRttXjlYLe-Qc_o`wqzd(9(7
z6RM|VN?M;N9x;`&?5^^V7}aGGJ^V?rUD@-y4Dosd2}gTnNi__8qacA@FqfzAl5o-s
zL{_l!P-laJ6X5#mk?QGT^&guhsxo^>BQpqJ;sFq+VJ`}$oa3uLj|ptgZnCow8^N!f
zR`jQ76*N3cz$uh=*}brK^2B>(h@D(c{M%ZJ_{=pi#2@r1ruS_J?;K)NRcYmmbIUyc
z2;vi)WkPObSg)t|3QjxJc~E1;{_t+asD=en;)r+W*MxyUON7-j^qj#4LFJ*ATVs)6
z`LlR)CfAjn%*Orn#+-E}@(XG)uuUOTPsiM40Q@EuMjoB&61V1_Aw-aVl6tR}F9go!
z>sRPKswH)o!#E{*NyK)=HJ_#7I<ASh@Tx37hoGTxZqG{SH`UD(xL#^>_)5Vd;qUFY
zuY+bLbpixRm8mc?1LRe(!$?6MNgbtXfMJdZEas0Fl^ic2XWt5cin&4Q5GL5e)lPPm
zrIAxCATTn#k|ApInW};Ni}0eJ1^acwy`b2-X9cO|7QBSdeqHe^CfTj>JbJR5(uti_
zAm-u<-_NJl#35Hin6zBFHwv_LqoMbV0=)IJtJM>E>Crr0J(9ypM9)GFfB1q{d}*>i
z24{x!1Ms$s9tw{oD+%W%;0&XW1SP&Tz@o$j%>GaeYgM9E(pmi+f%*4;O(vc^^9<Z=
zl&rYn;W@>#Su_XJIJhg|X#bh)FK2bpxB8P5e|mClUtE}RMV<d~cU#FC-<Yq-vgZ2@
z+?#U?x)C(G^?5+V^FO#{GL7!NXXGDVSTy~`Hxj`AxrM*8gTITilb_24(8)^6$V*Dg
zNXi0DWE7R<6_jNaB&8oIOG~@dax(oNf|rl;#oOBdPvBUs#9a{Rw4dvPDxTU${ts6G
B{H*{0

literal 0
HcmV?d00001